Final Project Submission¶
Please fill out:
- Student name:
- Student pace: self paced / part time / full time
- Scheduled project review date/time:
- Instructor name:
- Blog post URL:
In [ ]:
import pandas as pd
tsv_df = pd.read_csv('test_set_values.csv')
tsv_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | longitude | latitude | wpt_name | num_private | ... | payment_type | water_quality | quality_group | quantity | quantity_group | source | source_type | source_class | waterpoint_type | waterpoint_type_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50785 | 0.0 | 2013-02-04 | Dmdd | 1996 | DMDD | 35.290799 | -4.059696 | Dinamu Secondary School | 0 | ... | never pay | soft | good | seasonal | seasonal | rainwater harvesting | rainwater harvesting | surface | other | other |
| 1 | 51630 | 0.0 | 2013-02-04 | Government Of Tanzania | 1569 | DWE | 36.656709 | -3.309214 | Kimnyak | 0 | ... | never pay | soft | good | insufficient | insufficient | spring | spring | groundwater | communal standpipe | communal standpipe |
| 2 | 17168 | 0.0 | 2013-02-01 | NaN | 1567 | NaN | 34.767863 | -5.004344 | Puma Secondary | 0 | ... | never pay | soft | good | insufficient | insufficient | rainwater harvesting | rainwater harvesting | surface | other | other |
| 3 | 45559 | 0.0 | 2013-01-22 | Finn Water | 267 | FINN WATER | 38.058046 | -9.418672 | Kwa Mzee Pange | 0 | ... | unknown | soft | good | dry | dry | shallow well | shallow well | groundwater | other | other |
| 4 | 49871 | 500.0 | 2013-03-27 | Bruder | 1260 | BRUDER | 35.006123 | -10.950412 | Kwa Mzee Turuka | 0 | ... | monthly | soft | good | enough | enough | spring | spring | groundwater | communal standpipe | communal standpipe |
5 rows × 40 columns
In [ ]:
tsl_df = pd.read_csv('training_set_labels.csv')
tsl_df.head()
Out[ ]:
| id | status_group | |
|---|---|---|
| 0 | 69572 | functional |
| 1 | 8776 | functional |
| 2 | 34310 | functional |
| 3 | 67743 | non functional |
| 4 | 19728 | functional |
In [ ]:
training_values = pd.read_csv('training_set_values.csv')
training_values.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | longitude | latitude | wpt_name | num_private | ... | payment_type | water_quality | quality_group | quantity | quantity_group | source | source_type | source_class | waterpoint_type | waterpoint_type_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | 34.938093 | -9.856322 | none | 0 | ... | annually | soft | good | enough | enough | spring | spring | groundwater | communal standpipe | communal standpipe |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | 34.698766 | -2.147466 | Zahanati | 0 | ... | never pay | soft | good | insufficient | insufficient | rainwater harvesting | rainwater harvesting | surface | communal standpipe | communal standpipe |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | 37.460664 | -3.821329 | Kwa Mahundi | 0 | ... | per bucket | soft | good | enough | enough | dam | dam | surface | communal standpipe multiple | communal standpipe |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | 38.486161 | -11.155298 | Zahanati Ya Nanyumbu | 0 | ... | never pay | soft | good | dry | dry | machine dbh | borehole | groundwater | communal standpipe multiple | communal standpipe |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | 31.130847 | -1.825359 | Shuleni | 0 | ... | never pay | soft | good | seasonal | seasonal | rainwater harvesting | rainwater harvesting | surface | communal standpipe | communal standpipe |
5 rows × 40 columns
In [ ]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import RocCurveDisplay, roc_curve, roc_auc_score, auc
from sklearn.tree import DecisionTreeClassifier
In [ ]:
from scipy import stats
In [ ]:
plt.figure(figsize=(12, 6))
sns.barplot(x='construction_year', y='count', data=construction_top)
plt.xticks(rotation=90)
plt.xlabel("Construction Year")
plt.ylabel("Count of Wells")
plt.title("Number of Wells Constructed per Year")
# Show plot
plt.show()
In [ ]:
waterfalls_df.head()
Out[ ]:
| id | amount_tsh | funder | gps_height | installer | basin | region | region_code | district_code | lga | ... | extraction_type_class | management | management_group | payment_type | quality_group | quantity | source_type | source_class | waterpoint_type_group | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | roman | 1390 | roman | lake nyasa | iringa | 11 | 5 | Ludewa | ... | gravity | vwc | user-group | annually | good | enough | spring | groundwater | communal standpipe | functional |
| 2 | 34310 | 25.0 | lottery club | 686 | world vision | pangani | manyara | 21 | 4 | Simanjiro | ... | gravity | vwc | user-group | per bucket | good | enough | dam | surface | communal standpipe | functional |
| 3 | 67743 | 0.0 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 90 | 63 | Nanyumbu | ... | submersible | vwc | user-group | never pay | good | dry | borehole | groundwater | communal standpipe | non functional |
| 5 | 9944 | 20.0 | mkinga distric coun | 0 | dwe | pangani | tanga | 4 | 8 | Mkinga | ... | submersible | vwc | user-group | per bucket | salty | enough | other | unknown | communal standpipe | functional |
| 6 | 19816 | 0.0 | dwsp | 0 | dwsp | internal | shinyanga | 17 | 3 | Shinyanga Rural | ... | handpump | vwc | user-group | never pay | good | enough | borehole | groundwater | hand pump | non functional |
5 rows × 25 columns
In [ ]:
print(waterfalls_df.shape) # Should show (rows, columns)
print(waterfalls_df.isnull().sum().sum()) # Should be 0 or low
(48650, 25) 0
In [ ]:
training_df = pd.merge(training_values, tsl_df, on='id')
training_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | longitude | latitude | wpt_name | num_private | ... | water_quality | quality_group | quantity | quantity_group | source | source_type | source_class | waterpoint_type | waterpoint_type_group | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | 34.938093 | -9.856322 | none | 0 | ... | soft | good | enough | enough | spring | spring | groundwater | communal standpipe | communal standpipe | functional |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | 34.698766 | -2.147466 | Zahanati | 0 | ... | soft | good | insufficient | insufficient | rainwater harvesting | rainwater harvesting | surface | communal standpipe | communal standpipe | functional |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | 37.460664 | -3.821329 | Kwa Mahundi | 0 | ... | soft | good | enough | enough | dam | dam | surface | communal standpipe multiple | communal standpipe | functional |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | 38.486161 | -11.155298 | Zahanati Ya Nanyumbu | 0 | ... | soft | good | dry | dry | machine dbh | borehole | groundwater | communal standpipe multiple | communal standpipe | non functional |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | 31.130847 | -1.825359 | Shuleni | 0 | ... | soft | good | seasonal | seasonal | rainwater harvesting | rainwater harvesting | surface | communal standpipe | communal standpipe | functional |
5 rows × 41 columns
In [ ]:
full_df = pd.concat([training_df, tsv_df])
full_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | longitude | latitude | wpt_name | num_private | ... | water_quality | quality_group | quantity | quantity_group | source | source_type | source_class | waterpoint_type | waterpoint_type_group | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | 34.938093 | -9.856322 | none | 0 | ... | soft | good | enough | enough | spring | spring | groundwater | communal standpipe | communal standpipe | functional |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | 34.698766 | -2.147466 | Zahanati | 0 | ... | soft | good | insufficient | insufficient | rainwater harvesting | rainwater harvesting | surface | communal standpipe | communal standpipe | functional |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | 37.460664 | -3.821329 | Kwa Mahundi | 0 | ... | soft | good | enough | enough | dam | dam | surface | communal standpipe multiple | communal standpipe | functional |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | 38.486161 | -11.155298 | Zahanati Ya Nanyumbu | 0 | ... | soft | good | dry | dry | machine dbh | borehole | groundwater | communal standpipe multiple | communal standpipe | non functional |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | 31.130847 | -1.825359 | Shuleni | 0 | ... | soft | good | seasonal | seasonal | rainwater harvesting | rainwater harvesting | surface | communal standpipe | communal standpipe | functional |
5 rows × 41 columns
In [ ]:
numerical = full_df.select_dtypes(include= ['int', 'Int64', 'float']).columns.to_list()
numerical
Out[ ]:
['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 'population', 'construction_year']
In [ ]:
types = pd.DataFrame(
full_df[numerical].dtypes,
columns=['Data Type']
)
types
Out[ ]:
| Data Type | |
|---|---|
| id | int64 |
| amount_tsh | float64 |
| gps_height | int64 |
| longitude | float64 |
| latitude | float64 |
| num_private | int64 |
| region_code | int64 |
| district_code | int64 |
| population | int64 |
| construction_year | int64 |
In [ ]:
print(full_df.shape)
print(full_df.isnull().sum().sum())
(74250, 41) 73208 73208
In [ ]:
full_df = full_df.drop(['num_private', 'longitude', 'latitude', 'region_code', 'district_code'], axis=1)
full_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | wpt_name | basin | subvillage | region | ... | water_quality | quality_group | quantity | quantity_group | source | source_type | source_class | waterpoint_type | waterpoint_type_group | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | none | Lake Nyasa | Mnyusi B | Iringa | ... | soft | good | enough | enough | spring | spring | groundwater | communal standpipe | communal standpipe | functional |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | Zahanati | Lake Victoria | Nyamara | Mara | ... | soft | good | insufficient | insufficient | rainwater harvesting | rainwater harvesting | surface | communal standpipe | communal standpipe | functional |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | Kwa Mahundi | Pangani | Majengo | Manyara | ... | soft | good | enough | enough | dam | dam | surface | communal standpipe multiple | communal standpipe | functional |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | Zahanati Ya Nanyumbu | Ruvuma / Southern Coast | Mahakamani | Mtwara | ... | soft | good | dry | dry | machine dbh | borehole | groundwater | communal standpipe multiple | communal standpipe | non functional |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | Shuleni | Lake Victoria | Kyanyamisa | Kagera | ... | soft | good | seasonal | seasonal | rainwater harvesting | rainwater harvesting | surface | communal standpipe | communal standpipe | functional |
5 rows × 36 columns
In [ ]:
print(full_df.isnull().sum())
id 0 amount_tsh 0 date_recorded 0 funder 4507 gps_height 0 installer 4532 wpt_name 2 basin 0 subvillage 470 region 0 lga 0 ward 0 population 0 public_meeting 4155 recorded_by 0 scheme_management 4847 scheme_name 36052 permit 3793 construction_year 0 extraction_type 0 extraction_type_group 0 extraction_type_class 0 management 0 management_group 0 payment 0 payment_type 0 water_quality 0 quality_group 0 quantity 0 quantity_group 0 source 0 source_type 0 source_class 0 waterpoint_type 0 waterpoint_type_group 0 status_group 14850 dtype: int64
In [ ]:
full_df = full_df.drop(['waterpoint_type_group', 'quality_group', 'quantity_group', 'source_type', 'subvillage', 'wpt_name'], axis=1)
full_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | lga | ward | ... | management | management_group | payment | payment_type | water_quality | quantity | source | source_class | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | Lake Nyasa | Iringa | Ludewa | Mundindi | ... | vwc | user-group | pay annually | annually | soft | enough | spring | groundwater | communal standpipe | functional |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | Lake Victoria | Mara | Serengeti | Natta | ... | wug | user-group | never pay | never pay | soft | insufficient | rainwater harvesting | surface | communal standpipe | functional |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | Pangani | Manyara | Simanjiro | Ngorika | ... | vwc | user-group | pay per bucket | per bucket | soft | enough | dam | surface | communal standpipe multiple | functional |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | Ruvuma / Southern Coast | Mtwara | Nanyumbu | Nanyumbu | ... | vwc | user-group | never pay | never pay | soft | dry | machine dbh | groundwater | communal standpipe multiple | non functional |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | Lake Victoria | Kagera | Karagwe | Nyakasimbi | ... | other | other | never pay | never pay | soft | seasonal | rainwater harvesting | surface | communal standpipe | functional |
5 rows × 30 columns
In [ ]:
full_df = full_df.drop('recorded_by', axis=1)
In [ ]:
print(full_df.isnull().sum().sum())
72736
In [ ]:
print("Duplicate Rows:", full_df.duplicated().sum())
Duplicate Rows: 0
In [ ]:
full_df = full_df.drop(['public_meeting', 'scheme_management'], axis=1)
full_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | lga | ward | ... | management | management_group | payment | payment_type | water_quality | quantity | source | source_class | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | Lake Nyasa | Iringa | Ludewa | Mundindi | ... | vwc | user-group | pay annually | annually | soft | enough | spring | groundwater | communal standpipe | functional |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | Lake Victoria | Mara | Serengeti | Natta | ... | wug | user-group | never pay | never pay | soft | insufficient | rainwater harvesting | surface | communal standpipe | functional |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | Pangani | Manyara | Simanjiro | Ngorika | ... | vwc | user-group | pay per bucket | per bucket | soft | enough | dam | surface | communal standpipe multiple | functional |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | Ruvuma / Southern Coast | Mtwara | Nanyumbu | Nanyumbu | ... | vwc | user-group | never pay | never pay | soft | dry | machine dbh | groundwater | communal standpipe multiple | non functional |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | Lake Victoria | Kagera | Karagwe | Nyakasimbi | ... | other | other | never pay | never pay | soft | seasonal | rainwater harvesting | surface | communal standpipe | functional |
5 rows × 27 columns
In [ ]:
full_df = full_df.drop(['extraction_type_group', 'payment_type', 'source_class'], axis=1)
full_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | lga | ward | ... | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | Lake Nyasa | Iringa | Ludewa | Mundindi | ... | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | functional |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | Lake Victoria | Mara | Serengeti | Natta | ... | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | functional |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | Pangani | Manyara | Simanjiro | Ngorika | ... | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | functional |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | Ruvuma / Southern Coast | Mtwara | Nanyumbu | Nanyumbu | ... | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | non functional |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | Lake Victoria | Kagera | Karagwe | Nyakasimbi | ... | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | functional |
5 rows × 24 columns
In [ ]:
full_df =full_df.drop(['lga', 'ward'], axis=1)
full_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | population | scheme_name | ... | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | Lake Nyasa | Iringa | 109 | Roman | ... | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | functional |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | Lake Victoria | Mara | 280 | NaN | ... | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | functional |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | Pangani | Manyara | 250 | Nyumba ya mungu pipe scheme | ... | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | functional |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | Ruvuma / Southern Coast | Mtwara | 58 | NaN | ... | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | non functional |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | Lake Victoria | Kagera | 0 | NaN | ... | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | functional |
5 rows × 22 columns
In [ ]:
print(full_df.columns[full_df.columns.duplicated()])
Index([], dtype='object')
In [ ]:
print(full_df.index.duplicated().sum())
14850
In [ ]:
full_df
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | population | scheme_name | ... | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | Lake Nyasa | Iringa | 109 | Roman | ... | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | functional |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | Lake Victoria | Mara | 280 | NaN | ... | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | functional |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | Pangani | Manyara | 250 | Nyumba ya mungu pipe scheme | ... | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | functional |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | Ruvuma / Southern Coast | Mtwara | 58 | NaN | ... | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | non functional |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | Lake Victoria | Kagera | 0 | NaN | ... | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | functional |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14845 | 39307 | 0.0 | 2011-02-24 | Danida | 34 | Da | Wami / Ruvu | Pwani | 20 | Bagamoyo wate | ... | mono | motorpump | vwc | user-group | never pay | soft | enough | river | communal standpipe | NaN |
| 14846 | 18990 | 1000.0 | 2011-03-21 | Hiap | 0 | HIAP | Pangani | Tanga | 2960 | NaN | ... | nira/tanira | handpump | vwc | user-group | pay annually | salty | insufficient | shallow well | hand pump | NaN |
| 14847 | 28749 | 0.0 | 2013-03-04 | NaN | 1476 | NaN | Internal | Singida | 200 | NaN | ... | gravity | gravity | vwc | user-group | never pay | soft | insufficient | dam | communal standpipe | NaN |
| 14848 | 33492 | 0.0 | 2013-02-18 | Germany | 998 | DWE | Lake Nyasa | Ruvuma | 150 | Mradi wa maji wa maposeni | ... | gravity | gravity | vwc | user-group | never pay | soft | insufficient | river | communal standpipe | NaN |
| 14849 | 68707 | 0.0 | 2013-02-13 | Government Of Tanzania | 481 | Government | Lake Nyasa | Ruvuma | 40 | DANIDA | ... | gravity | gravity | vwc | user-group | never pay | soft | dry | spring | communal standpipe | NaN |
74250 rows × 22 columns
In [ ]:
full_df = full_df.reset_index(drop=True)
In [ ]:
sns.countplot(x='permit', hue='status_group', data=full_df)
plt.title("Keep Permit Column?")
plt.show()
In [ ]:
backup_df = full_df
In [ ]:
backup_df['status_group'] = full_df['status_group'].astype('category').cat.codes
backup_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | population | scheme_name | ... | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | Lake Nyasa | Iringa | 109 | Roman | ... | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | Lake Victoria | Mara | 280 | NaN | ... | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | Pangani | Manyara | 250 | Nyumba ya mungu pipe scheme | ... | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | Ruvuma / Southern Coast | Mtwara | 58 | NaN | ... | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | Lake Victoria | Kagera | 0 | NaN | ... | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 |
5 rows × 22 columns
In [ ]:
full_df['permit'] = full_df['permit'].astype('category').cat.codes
full_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | population | scheme_name | ... | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | Lake Nyasa | Iringa | 109 | Roman | ... | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | Lake Victoria | Mara | 280 | NaN | ... | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | Pangani | Manyara | 250 | Nyumba ya mungu pipe scheme | ... | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | Ruvuma / Southern Coast | Mtwara | 58 | NaN | ... | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | Lake Victoria | Kagera | 0 | NaN | ... | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 |
5 rows × 22 columns
In [ ]:
full_df = full_df[full_df['status_group'] != -1]
print(f"Remaining rows with status_group = -1: {(full_df['status_group'] == -1).sum()}")
print(f"New dataset shape: {full_df.shape}")
Remaining rows with status_group = -1: 0 New dataset shape: (59400, 22)
In [ ]:
numeric_df = full_df.select_dtypes(include=['number'])
print(numeric_df.corr()['status_group'])
id 0.004049 amount_tsh -0.053702 gps_height -0.114029 population -0.017759 permit -0.019728 construction_year -0.043342 status_group 1.000000 Name: status_group, dtype: float64
In [ ]:
print(full_df.dtypes)
id int64 amount_tsh float64 date_recorded object funder object gps_height int64 installer object basin object region object population int64 scheme_name object permit int8 construction_year int64 extraction_type object extraction_type_class object management object management_group object payment object water_quality object quantity object source object waterpoint_type object status_group int8 dtype: object
In [ ]:
full_df = full_df.drop('permit', axis=1)
full_df
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | population | scheme_name | ... | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | Roman | 1390 | Roman | Lake Nyasa | Iringa | 109 | Roman | ... | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 |
| 1 | 8776 | 0.0 | 2013-03-06 | Grumeti | 1399 | GRUMETI | Lake Victoria | Mara | 280 | NaN | ... | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 |
| 2 | 34310 | 25.0 | 2013-02-25 | Lottery Club | 686 | World vision | Pangani | Manyara | 250 | Nyumba ya mungu pipe scheme | ... | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 |
| 3 | 67743 | 0.0 | 2013-01-28 | Unicef | 263 | UNICEF | Ruvuma / Southern Coast | Mtwara | 58 | NaN | ... | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 |
| 4 | 19728 | 0.0 | 2011-07-13 | Action In A | 0 | Artisan | Lake Victoria | Kagera | 0 | NaN | ... | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 59395 | 60739 | 10.0 | 2013-05-03 | Germany Republi | 1210 | CES | Pangani | Kilimanjaro | 125 | Losaa Kia water supply | ... | gravity | gravity | water board | user-group | pay per bucket | soft | enough | spring | communal standpipe | 0 |
| 59396 | 27263 | 4700.0 | 2011-05-07 | Cefa-njombe | 1212 | Cefa | Rufiji | Iringa | 56 | Ikondo electrical water sch | ... | gravity | gravity | vwc | user-group | pay annually | soft | enough | river | communal standpipe | 0 |
| 59397 | 37057 | 0.0 | 2011-04-11 | NaN | 0 | NaN | Rufiji | Mbeya | 0 | NaN | ... | swn 80 | handpump | vwc | user-group | pay monthly | fluoride | enough | machine dbh | hand pump | 0 |
| 59398 | 31282 | 0.0 | 2011-03-08 | Malec | 0 | Musa | Rufiji | Dodoma | 0 | NaN | ... | nira/tanira | handpump | vwc | user-group | never pay | soft | insufficient | shallow well | hand pump | 0 |
| 59399 | 26348 | 0.0 | 2011-03-23 | World Bank | 191 | World | Wami / Ruvu | Morogoro | 150 | NaN | ... | nira/tanira | handpump | vwc | user-group | pay when scheme fails | salty | enough | shallow well | hand pump | 0 |
59400 rows × 21 columns
In [ ]:
categorical = full_df.select_dtypes(include= ['object']).columns.to_list()
categorical
Out[ ]:
['date_recorded', 'funder', 'installer', 'basin', 'region', 'scheme_name', 'extraction_type', 'extraction_type_class', 'management', 'management_group', 'payment', 'water_quality', 'quantity', 'source', 'waterpoint_type']
In [ ]:
types = pd.DataFrame(
full_df[categorical].dtypes,
columns=['Data Type']
)
types
Out[ ]:
| Data Type | |
|---|---|
| date_recorded | object |
| funder | object |
| installer | object |
| basin | object |
| region | object |
| scheme_name | object |
| extraction_type | object |
| extraction_type_class | object |
| management | object |
| management_group | object |
| payment | object |
| water_quality | object |
| quantity | object |
| source | object |
| waterpoint_type | object |
In [ ]:
for col in categorical:
full_df[col] = full_df[col].str.lower().str.strip()
In [ ]:
for col in categorical:
print(f"Unique values in {col}:\n", full_df[col].value_counts(), "\n")
Unique values in date_recorded:
date_recorded
2011-03-15 572
2011-03-17 558
2013-02-03 546
2011-03-14 520
2011-03-16 513
...
2011-09-11 1
2011-08-31 1
2011-09-21 1
2011-08-30 1
2013-12-01 1
Name: count, Length: 356, dtype: int64
Unique values in funder:
funder
government of tanzania 9084
danida 3114
hesawa 2202
rwssp 1374
world bank 1349
...
muwasa 1
msigw 1
rc mofu 1
overland high school 1
samlo 1
Name: count, Length: 1896, dtype: int64
Unique values in installer:
installer
dwe 17405
government 1891
hesawa 1395
rwe 1206
commu 1065
...
amadi 1
jafary mbaga 1
sa 1
luleka 1
selepta 1
Name: count, Length: 1935, dtype: int64
Unique values in basin:
basin
lake victoria 10248
pangani 8940
rufiji 7976
internal 7785
lake tanganyika 6432
wami / ruvu 5987
lake nyasa 5085
ruvuma / southern coast 4493
lake rukwa 2454
Name: count, dtype: int64
Unique values in region:
region
iringa 5294
shinyanga 4982
mbeya 4639
kilimanjaro 4379
morogoro 4006
arusha 3350
kagera 3316
mwanza 3102
kigoma 2816
ruvuma 2640
pwani 2635
tanga 2547
dodoma 2201
singida 2093
mara 1969
tabora 1959
rukwa 1808
mtwara 1730
manyara 1583
lindi 1546
dar es salaam 805
Name: count, dtype: int64
Unique values in scheme_name:
scheme_name
k 685
borehole 546
chalinze wate 406
m 400
danida 379
...
kifaru/kituri 1
merali juu line 1
ukuu 1
kizimba forest 1
mtawanya 1
Name: count, Length: 2576, dtype: int64
Unique values in extraction_type:
extraction_type
gravity 26780
nira/tanira 8154
other 6430
submersible 4764
swn 80 3670
mono 2865
india mark ii 2400
afridev 1770
ksb 1415
other - rope pump 451
other - swn 81 229
windmill 117
india mark iii 98
cemo 90
other - play pump 85
walimi 48
climax 32
other - mkulima/shinyanga 2
Name: count, dtype: int64
Unique values in extraction_type_class:
extraction_type_class
gravity 26780
handpump 16456
other 6430
submersible 6179
motorpump 2987
rope pump 451
wind-powered 117
Name: count, dtype: int64
Unique values in management:
management
vwc 40507
wug 6515
water board 2933
wua 2535
private operator 1971
parastatal 1768
water authority 904
other 844
company 685
unknown 561
other - school 99
trust 78
Name: count, dtype: int64
Unique values in management_group:
management_group
user-group 52490
commercial 3638
parastatal 1768
other 943
unknown 561
Name: count, dtype: int64
Unique values in payment:
payment
never pay 25348
pay per bucket 8985
pay monthly 8300
unknown 8157
pay when scheme fails 3914
pay annually 3642
other 1054
Name: count, dtype: int64
Unique values in water_quality:
water_quality
soft 50818
salty 4856
unknown 1876
milky 804
coloured 490
salty abandoned 339
fluoride 200
fluoride abandoned 17
Name: count, dtype: int64
Unique values in quantity:
quantity
enough 33186
insufficient 15129
dry 6246
seasonal 4050
unknown 789
Name: count, dtype: int64
Unique values in source:
source
spring 17021
shallow well 16824
machine dbh 11075
river 9612
rainwater harvesting 2295
hand dtw 874
lake 765
dam 656
other 212
unknown 66
Name: count, dtype: int64
Unique values in waterpoint_type:
waterpoint_type
communal standpipe 28522
hand pump 17488
other 6380
communal standpipe multiple 6103
improved spring 784
cattle trough 116
dam 7
Name: count, dtype: int64
In [ ]:
full_df[categorical] = full_df[categorical].fillna("unknown")
In [ ]:
full_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | population | scheme_name | ... | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | roman | 1390 | roman | lake nyasa | iringa | 109 | roman | ... | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 |
| 1 | 8776 | 0.0 | 2013-03-06 | grumeti | 1399 | grumeti | lake victoria | mara | 280 | unknown | ... | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 |
| 2 | 34310 | 25.0 | 2013-02-25 | lottery club | 686 | world vision | pangani | manyara | 250 | nyumba ya mungu pipe scheme | ... | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 |
| 3 | 67743 | 0.0 | 2013-01-28 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 58 | unknown | ... | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 |
| 4 | 19728 | 0.0 | 2011-07-13 | action in a | 0 | artisan | lake victoria | kagera | 0 | unknown | ... | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 |
5 rows × 21 columns
In [ ]:
region_counts = full_df[full_df["status_group"] == 0]["region"].value_counts().reset_index()
region_counts.columns = ["region", "count"]
plt.figure(figsize=(10, 5))
sns.barplot(x="region", y="count", data=region_counts, palette="Reds_r")
plt.xlabel("Region")
plt.ylabel("Well Functionality Count")
plt.title("Functionality by Region")
plt.xticks(rotation=90)
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/2272151946.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x="region", y="count", data=region_counts, palette="Reds_r")
In [ ]:
full_df['region'].value_counts()
Out[ ]:
region iringa 5294 shinyanga 4982 mbeya 4639 kilimanjaro 4379 morogoro 4006 arusha 3350 kagera 3316 mwanza 3102 kigoma 2816 ruvuma 2640 pwani 2635 tanga 2547 dodoma 2201 singida 2093 mara 1969 tabora 1959 rukwa 1808 mtwara 1730 manyara 1583 lindi 1546 dar es salaam 805 Name: count, dtype: int64
In [ ]:
basin_counts = full_df[full_df["status_group"] == 0]["basin"].value_counts().reset_index()
basin_counts.columns = ["basin", "count"]
plt.figure(figsize=(10, 6))
sns.barplot(x=full_df["basin"], y=full_df["status_group"], palette="Blues_r")
plt.xlabel("Basin")
plt.ylabel("Well Functionality Count")
plt.title("Functionality by Basin")
plt.xticks(rotation=90)
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/2302165471.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=full_df["basin"], y=full_df["status_group"], palette="Blues_r")
In [ ]:
extraction_type_counts = full_df[full_df["status_group"] == 0]["extraction_type"].value_counts().reset_index()
extraction_type_counts.columns = ["extraction_type", "count"]
plt.figure(figsize=(10, 5))
sns.barplot(x="extraction_type", y="count", data=extraction_type_counts, palette="Reds_r")
plt.xlabel("GPS_Height")
plt.ylabel("Well Functionality Count")
plt.title("Functionality by Extraction Type")
plt.xticks(rotation=90)
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/2660479009.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x="extraction_type", y="count", data=extraction_type_counts, palette="Reds_r")
In [ ]:
quality_counts = full_df[full_df["status_group"] == 0]["water_quality"].value_counts().reset_index()
quality_counts.columns = ["water_quality", "count"]
plt.figure(figsize=(10, 6))
sns.barplot(x=full_df["water_quality"], y=full_df["status_group"], palette="Blues_r")
plt.xlabel("Water Quality")
plt.ylabel("Well Functionality Count")
plt.title("Functionality and Water Quality")
plt.xticks(rotation=90)
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/3668846308.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=full_df["water_quality"], y=full_df["status_group"], palette="Blues_r")
In [ ]:
high_cardinality_categoricals = ['funder', 'installer', 'scheme_name']
for col in high_cardinality_categoricals:
plt.figure(figsize=(12,5))
full_df[col].value_counts().plot(kind='bar')
plt.title(f"Frequency Distribution for {col}")
plt.xlabel(col)
plt.ylabel("Count")
plt.show()
In [ ]:
for col in high_cardinality_categoricals:
value_counts = full_df[col].value_counts(normalize=True).cumsum()
print(f"\n[col] - Cumulative Distribution:")
print(value_counts.head(20))
[col] - Cumulative Distribution: funder government of tanzania 0.152929 unknown 0.214226 danida 0.266650 hesawa 0.303721 rwssp 0.326852 world bank 0.349562 kkkt 0.371229 world vision 0.392205 unicef 0.410000 tasaf 0.424764 district council 0.438956 dhv 0.452912 private individual 0.466818 dwsp 0.480471 0 0.493552 norad 0.506431 germany republi 0.516700 tcrs 0.526835 ministry of water 0.536768 water 0.546582 Name: proportion, dtype: float64 [col] - Cumulative Distribution: installer dwe 0.293013 unknown 0.354613 government 0.386448 hesawa 0.409933 rwe 0.430236 commu 0.448165 danida 0.465842 district council 0.482020 kkkt 0.497340 0 0.510421 central government 0.523266 tcrs 0.535168 world vision 0.546936 danid 0.557441 ces 0.567710 community 0.577054 gover 0.585657 amref 0.593114 lga 0.600067 tasaf 0.606987 Name: proportion, dtype: float64 [col] - Cumulative Distribution: scheme_name unknown 0.485017 k 0.496549 borehole 0.505741 chalinze wate 0.512576 m 0.519310 danida 0.525690 government 0.531077 bagamoyo wate 0.536061 ngana water supplied scheme 0.540606 wanging'ombe water supply s 0.545000 wanging'ombe supply scheme 0.548973 i 0.552828 uroki-bomang'ombe water sup 0.556347 n 0.559781 kirua kahe gravity water supply trust 0.563030 machumba estate pipe line 0.566145 makwale water supplied sche 0.568939 kijiji 0.571650 mkongoro one 0.574293 s 0.576886 Name: proportion, dtype: float64
In [ ]:
from sklearn.feature_selection import chi2
import numpy as np
for col in high_cardinality_categoricals:
X = pd.get_dummies(full_df[col], drop_first=True) # One-hot encode categories
y = full_df['status_group'] # Target variable
chi_scores, p_values = chi2(X, y)
chi_results = pd.DataFrame({"Category": X.columns, "Chi2": chi_scores, "p-value": p_values})
chi_results = chi_results.sort_values(by="Chi2", ascending=False)
print(f"\n{col} - Top 20 Categories by Chi-Square Importance:")
print(chi_results.head(20)) # Show the most significant categories
funder - Top 20 Categories by Chi-Square Importance:
Category Chi2 p-value
455 government of tanzania 694.831216 1.316189e-151
344 dwsp 508.758234 3.346209e-111
587 isf 388.141343 5.202095e-85
389 fini water 345.385611 1.001077e-75
435 germany republi 285.245585 1.147382e-62
1270 private individual 284.606237 1.579577e-62
1137 norad 281.728518 6.659315e-62
417 fw 213.912100 3.544686e-47
179 ces(gmbh) 183.483701 1.435557e-40
791 lga 170.746450 8.372967e-38
970 ministry of water 167.077460 5.243129e-37
1748 unknown 135.230796 4.315249e-30
854 magadini-makiwaru water 133.047224 1.285767e-29
512 hesawa 130.558801 4.461868e-29
1863 world bank 116.721629 4.510454e-26
1320 rc church 113.898007 1.850810e-25
779 lawatefuka water supply 110.805166 8.688883e-25
395 finw 109.850407 1.400513e-24
1881 wvt 108.743591 2.435731e-24
720 kiuma 102.315911 6.058740e-23
installer - Top 20 Categories by Chi-Square Importance:
Category Chi2 p-value
510 government 498.298974 6.248167e-109
1445 rwe 412.146674 3.187770e-90
449 fini water 346.882206 4.736811e-76
205 central government 336.444184 8.751322e-74
207 ces 285.245585 1.147382e-62
469 fw 213.912100 3.544686e-47
12 acra 212.373661 7.649718e-47
363 dmdd 206.622634 1.356641e-45
855 lga 179.452130 1.077618e-39
309 danid 159.306528 2.552860e-35
1756 unknown 139.454114 5.223048e-31
361 district water department 138.742771 7.454011e-31
917 magadini-makiwaru wa 133.047224 1.285767e-29
907 ma 130.905069 3.752540e-29
1910 wu 129.559501 7.353802e-29
453 finw 126.661928 3.131207e-28
201 centr 124.787592 7.993164e-28
384 dwe 117.982336 2.401382e-26
1387 region water department 114.920225 1.110172e-25
849 lawatefuka water sup 110.805166 8.688883e-25
scheme_name - Top 20 Categories by Chi-Square Importance:
Category Chi2 p-value
1472 mkongoro one 584.017885 1.520994e-127
1474 mkongoro two 356.587750 3.697890e-78
1361 mbati water supply 204.152884 4.664064e-45
1662 ms 171.100009 7.016251e-38
218 chalinze wate 167.235115 4.845696e-37
2054 nyanga/kalege 153.114663 5.643823e-34
11 amani water supply 140.355108 3.328707e-31
1913 ngana water supplied scheme 134.523334 6.146519e-30
2309 sinyanga water supplied sch 130.803988 3.947070e-29
918 kizingu 121.975514 3.261021e-27
2539 wausa 114.105820 1.668153e-25
1732 mtwango water supply scheme 106.841020 6.306194e-24
591 k 104.226676 2.330585e-23
1048 losaa-kia water supply 103.538394 3.287936e-23
849 kirua kahe gravity water supply trust 102.858256 4.619696e-23
2136 pahi 101.138716 1.091458e-22
681 kaseke water project 99.889666 2.038143e-22
2455 unknown 95.973563 1.444128e-21
850 kirua kahe pumping water trust 95.072166 2.266426e-21
228 chankele/bubango water project 94.868322 2.509607e-21
In [ ]:
full_df['installer'] = full_df['installer'].dropna()
In [ ]:
full_df['installer'] =full_df['installer'].str.strip()
In [ ]:
full_df['installer'] = full_df['installer'].str.lower()
full_df['installer'] = full_df['installer'].str.replace('[^a-zA-Z0-9]', '', regex=True)
In [ ]:
full_df['funder'] = full_df['funder'].dropna()
full_df['funder'] = full_df['funder'].str.strip()
full_df['funder'] = full_df['funder'].str.lower()
full_df['funder'] = full_df['funder'].str.replace('[^a-zA-Z0-9]', '', regex=True)
In [ ]:
full_df['scheme_name'] = full_df['scheme_name'].dropna()
full_df['scheme_name'] = full_df['scheme_name'].str.strip()
full_df['scheme_name'] = full_df['scheme_name'].str.lower()
full_df['scheme_name'] = full_df['scheme_name'].str.replace('[^a-zA-Z0-9]', '', regex=True)
In [ ]:
funder_cardinal = full_df['funder']
funder_cardinal.head()
Out[ ]:
0 roman 1 grumeti 2 lotteryclub 3 unicef 4 actionina Name: funder, dtype: object
In [ ]:
sns.countplot(x='funder', hue='status_group', data=full_df)
plt.title("Keep Funder Column?")
plt.show()
In [ ]:
print(funder_cardinal.index.duplicated().sum())
0
In [ ]:
def display_cumulative_percentage(df, column):
value_counts = df[column].value_counts(normalize=True).cumsum()
print(f"\n{column} - Cumulative Distribution:\n")
print(value_counts.head(30))
plt.figure(figsize=(12, 5))
plt.plot(range(1, len(value_counts) + 1), value_counts, marker='o', linestyle='-')
plt.xlabel("Number of Categories")
plt.ylabel("Cumulative Percentage")
plt.title(f"Cumulative Percentage of {column}")
plt.axhline(y=0.8, color='r', linestyle='--', label='80% threshold')
plt.axhline(y=0.9, color='g', linestyle='--', label='90% threshold')
plt.legend()
plt.show()
display_cumulative_percentage(full_df, 'installer')
installer - Cumulative Distribution: installer dwe 0.293401 unknown 0.355000 government 0.386835 hesawa 0.410320 rwe 0.430623 commu 0.448552 danida 0.466229 districtcouncil 0.482475 kkkt 0.497795 0 0.510875 centralgovernment 0.523721 tcrs 0.535623 worldvision 0.547391 danid 0.557896 ces 0.568165 community 0.577508 gover 0.586111 amref 0.593569 tasaf 0.600572 lga 0.607525 wedeco 0.614226 finiwater 0.620774 dmdd 0.627104 norad 0.633266 twesa 0.638586 da 0.643771 wu 0.648838 acra 0.653519 sema 0.657710 dw 0.661886 Name: proportion, dtype: float64
In [ ]:
def display_cumulative_percentage(df, column):
value_counts = df[column].value_counts(normalize=True).cumsum()
print(f"\n{column} - Cumulative Distribution:\n")
print(value_counts.head(30))
plt.figure(figsize=(12, 5))
plt.plot(range(1, len(value_counts) + 1), value_counts, marker='o', linestyle='-')
plt.xlabel("Number of Categories")
plt.ylabel("Cumulative Percentage")
plt.title(f"Cumulative Percentage of {column}")
plt.axhline(y=0.8, color='r', linestyle='--', label='80% threshold')
plt.axhline(y=0.9, color='g', linestyle='--', label='90% threshold')
plt.legend()
plt.show()
display_cumulative_percentage(full_df, 'funder')
funder - Cumulative Distribution: funder governmentoftanzania 0.152929 unknown 0.214226 danida 0.266650 hesawa 0.303721 rwssp 0.326852 worldbank 0.349562 kkkt 0.371229 worldvision 0.392222 unicef 0.410017 tasaf 0.424781 districtcouncil 0.438973 dhv 0.452929 privateindividual 0.466835 dwsp 0.480488 0 0.493569 norad 0.506448 germanyrepubli 0.516717 tcrs 0.526852 ministryofwater 0.536785 water 0.546599 dwe 0.554747 netherlands 0.562660 hifab 0.570236 adb 0.577778 lga 0.585219 cesgmbh 0.592374 amref 0.599529 finiwater 0.606145 oxfam 0.612189 wateraid 0.617795 Name: proportion, dtype: float64
In [ ]:
def display_cumulative_percentage(df, column):
value_counts = df[column].value_counts(normalize=True).cumsum()
print(f"\n{column} - Cumulative Distribution:\n")
print(value_counts.head(30))
plt.figure(figsize=(12, 5))
plt.plot(range(1, len(value_counts) + 1), value_counts, marker='o', linestyle='-')
plt.xlabel("Number of Categories")
plt.ylabel("Cumulative Percentage")
plt.title(f"Cumulative Percentage of {column}")
plt.axhline(y=0.8, color='r', linestyle='--', label='80% threshold')
plt.axhline(y=0.9, color='g', linestyle='--', label='90% threshold')
plt.legend()
plt.show()
display_cumulative_percentage(full_df, 'scheme_name')
scheme_name - Cumulative Distribution: scheme_name unknown 0.485017 k 0.496549 borehole 0.505825 chalinzewate 0.512660 m 0.519394 danida 0.525774 government 0.531162 bagamoyowate 0.536145 nganawatersuppliedscheme 0.540690 wangingombewatersupplys 0.545084 wangingombesupplyscheme 0.549057 i 0.552912 urokibomangombewatersup 0.556515 lyamungoumbwewatersupply 0.560084 n 0.563519 losaakiawatersupply 0.566953 kiruakahegravitywatersupplytrust 0.570202 machumbaestatepipeline 0.573316 makwalewatersuppliedsche 0.576111 kijiji 0.578822 mkongoroone 0.581465 s 0.584057 handenitrunkmainh 0.586616 mtwangowatersupplyscheme 0.589175 mkongorotwo 0.591700 roman 0.594040 upperruvu 0.596279 maambrenigravitywatersupply 0.598384 shallowwell 0.600387 ngamangawatersuppliedsch 0.602340 Name: proportion, dtype: float64
In [ ]:
full_df.drop('scheme_name', axis=1)
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | population | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | roman | 1390 | roman | lake nyasa | iringa | 109 | 1999 | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 |
| 1 | 8776 | 0.0 | 2013-03-06 | grumeti | 1399 | grumeti | lake victoria | mara | 280 | 2010 | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 |
| 2 | 34310 | 25.0 | 2013-02-25 | lotteryclub | 686 | worldvision | pangani | manyara | 250 | 2009 | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 |
| 3 | 67743 | 0.0 | 2013-01-28 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 58 | 1986 | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 |
| 4 | 19728 | 0.0 | 2011-07-13 | actionina | 0 | artisan | lake victoria | kagera | 0 | 0 | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 59395 | 60739 | 10.0 | 2013-05-03 | germanyrepubli | 1210 | ces | pangani | kilimanjaro | 125 | 1999 | gravity | gravity | water board | user-group | pay per bucket | soft | enough | spring | communal standpipe | 0 |
| 59396 | 27263 | 4700.0 | 2011-05-07 | cefanjombe | 1212 | cefa | rufiji | iringa | 56 | 1996 | gravity | gravity | vwc | user-group | pay annually | soft | enough | river | communal standpipe | 0 |
| 59397 | 37057 | 0.0 | 2011-04-11 | unknown | 0 | unknown | rufiji | mbeya | 0 | 0 | swn 80 | handpump | vwc | user-group | pay monthly | fluoride | enough | machine dbh | hand pump | 0 |
| 59398 | 31282 | 0.0 | 2011-03-08 | malec | 0 | musa | rufiji | dodoma | 0 | 0 | nira/tanira | handpump | vwc | user-group | never pay | soft | insufficient | shallow well | hand pump | 0 |
| 59399 | 26348 | 0.0 | 2011-03-23 | worldbank | 191 | world | wami / ruvu | morogoro | 150 | 2002 | nira/tanira | handpump | vwc | user-group | pay when scheme fails | salty | enough | shallow well | hand pump | 0 |
59400 rows × 20 columns
In [ ]:
sns.countplot(x='population', hue='status_group', data=full_df)
plt.title("Keep Population Column?")
plt.show()
In [ ]:
print(full_df.groupby('population')['status_group'].value_counts(normalize=True))
population status_group
0 0 0.527291
2 0.389692
1 0.083018
1 2 0.535231
0 0.414377
...
9865 2 1.000000
10000 0 1.000000
11463 0 1.000000
15300 1 1.000000
30500 0 1.000000
Name: proportion, Length: 1939, dtype: float64
In [ ]:
full_df['date_recorded'] = pd.to_datetime(full_df['date_recorded'])
full_df['year_recorded'] = full_df['date_recorded'].dt.year
full_df.head()
Out[ ]:
| id | amount_tsh | date_recorded | funder | gps_height | installer | basin | region | population | scheme_name | ... | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69572 | 6000.0 | 2011-03-14 | roman | 1390 | roman | lake nyasa | iringa | 109 | roman | ... | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 | 2011 |
| 1 | 8776 | 0.0 | 2013-03-06 | grumeti | 1399 | grumeti | lake victoria | mara | 280 | unknown | ... | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 | 2013 |
| 2 | 34310 | 25.0 | 2013-02-25 | lotteryclub | 686 | worldvision | pangani | manyara | 250 | nyumbayamungupipescheme | ... | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 | 2013 |
| 3 | 67743 | 0.0 | 2013-01-28 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 58 | unknown | ... | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 | 2013 |
| 4 | 19728 | 0.0 | 2011-07-13 | actionina | 0 | artisan | lake victoria | kagera | 0 | unknown | ... | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 | 2011 |
5 rows × 22 columns
In [ ]:
full_df = full_df.drop(['id', 'date_recorded'], axis=1)
full_df.head()
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | population | scheme_name | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6000.0 | roman | 1390 | roman | lake nyasa | iringa | 109 | roman | 1999 | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 | 2011 |
| 1 | 0.0 | grumeti | 1399 | grumeti | lake victoria | mara | 280 | unknown | 2010 | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 | 2013 |
| 2 | 25.0 | lotteryclub | 686 | worldvision | pangani | manyara | 250 | nyumbayamungupipescheme | 2009 | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 | 2013 |
| 3 | 0.0 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 58 | unknown | 1986 | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 | 2013 |
| 4 | 0.0 | actionina | 0 | artisan | lake victoria | kagera | 0 | unknown | 0 | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 | 2011 |
In [ ]:
sns.pairplot(full_df)
plt.show()
In [ ]:
backup_df = full_df
In [ ]:
sns.boxplot(x='status_group', y='construction_year', data=full_df)
plt.show()
In [ ]:
full_df['scheme_name'].nunique()
Out[ ]:
2482
In [ ]:
full_df['scheme_name'].value_counts()
Out[ ]:
scheme_name
unknown 28810
k 685
borehole 551
chalinzewate 406
m 400
...
mshinde 1
wd 1
nguj 1
tasafdam 1
mtawanya 1
Name: count, Length: 2482, dtype: int64
In [ ]:
full_df = full_df.drop('scheme_name', axis=1)
full_df.head()
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | population | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6000.0 | roman | 1390 | roman | lake nyasa | iringa | 109 | 1999 | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 | 2011 |
| 1 | 0.0 | grumeti | 1399 | grumeti | lake victoria | mara | 280 | 2010 | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 | 2013 |
| 2 | 25.0 | lotteryclub | 686 | worldvision | pangani | manyara | 250 | 2009 | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 | 2013 |
| 3 | 0.0 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 58 | 1986 | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 | 2013 |
| 4 | 0.0 | actionina | 0 | artisan | lake victoria | kagera | 0 | 0 | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 | 2011 |
In [ ]:
def reduce_cardinality(full_df, installer, top_percent=5):
# Get value counts as a percentage
value_counts = full_df['installer'].value_counts(normalize=True)
# Get cumulative percentage
cumulative = value_counts.cumsum()
# Get the threshold for top X% categories
num_top = int(len(value_counts) * (top_percent / 100))
# Select top categories
top_categories = value_counts.iloc[:num_top].index
# Replace infrequent categories with 'Other'
full_df['installer'] = full_df[installer].apply(lambda x: x if x in top_categories else 'Other')
return full_df
# Apply function to both columns
full_df = reduce_cardinality(full_df, 'installer', top_percent=5)
# Check results
full_df['installer'].value_counts()
Out[ ]:
installer
dwe 17428
Other 10628
unknown 3659
government 1891
hesawa 1395
...
roman 94
oikoseafrica 93
vwc 91
cefa 90
missi 87
Name: count, Length: 93, dtype: int64
In [ ]:
def reduce_cardinality(full_df, funder, top_percent=5):
# Get value counts as a percentage
value_counts = full_df['funder'].value_counts(normalize=True)
# Get cumulative percentage
cumulative = value_counts.cumsum()
# Get the threshold for top X% categories
num_top = int(len(value_counts) * (top_percent / 100))
# Select top categories
top_categories = value_counts.iloc[:num_top].index
# Replace infrequent categories with 'Other'
full_df['funder'] = full_df[funder].apply(lambda x: x if x in top_categories else 'Other')
return full_df
# Apply function to both columns
full_df = reduce_cardinality(full_df, 'funder', top_percent=5)
# Check results
full_df['funder'].value_counts()
Out[ ]:
funder
Other 12170
governmentoftanzania 9084
unknown 3641
danida 3114
hesawa 2202
...
halmashauriyawilayasikonge 102
germany 101
hsw 101
finwater 101
twe 97
Name: count, Length: 94, dtype: int64
In [ ]:
full_df = full_df.drop('population', axis=1)
full_df.head()
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6000.0 | roman | 1390 | roman | lake nyasa | iringa | 1999 | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 | 2011 |
| 1 | 0.0 | Other | 1399 | Other | lake victoria | mara | 2010 | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 | 2013 |
| 2 | 25.0 | Other | 686 | worldvision | pangani | manyara | 2009 | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 | 2013 |
| 3 | 0.0 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 1986 | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 | 2013 |
| 4 | 0.0 | Other | 0 | artisan | lake victoria | kagera | 0 | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 | 2011 |
In [ ]:
backup_df.head()
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | population | scheme_name | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6000.0 | roman | 1390 | roman | lake nyasa | iringa | 109 | roman | 1999 | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 | 2011 |
| 1 | 0.0 | grumeti | 1399 | grumeti | lake victoria | mara | 280 | unknown | 2010 | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 | 2013 |
| 2 | 25.0 | lotteryclub | 686 | worldvision | pangani | manyara | 250 | nyumbayamungupipescheme | 2009 | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 | 2013 |
| 3 | 0.0 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 58 | unknown | 1986 | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 | 2013 |
| 4 | 0.0 | actionina | 0 | artisan | lake victoria | kagera | 0 | unknown | 0 | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 | 2011 |
In [ ]:
full_df = backup_df
full_df.head()
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | population | scheme_name | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6000.0 | roman | 1390 | roman | lake nyasa | iringa | 109 | roman | 1999 | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 | 2011 |
| 1 | 0.0 | grumeti | 1399 | grumeti | lake victoria | mara | 280 | unknown | 2010 | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 | 2013 |
| 2 | 25.0 | lotteryclub | 686 | worldvision | pangani | manyara | 250 | nyumbayamungupipescheme | 2009 | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 | 2013 |
| 3 | 0.0 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 58 | unknown | 1986 | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 | 2013 |
| 4 | 0.0 | actionina | 0 | artisan | lake victoria | kagera | 0 | unknown | 0 | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 | 2011 |
In [ ]:
full_df = full_df.drop(['scheme_name', 'population'], axis=1)
full_df.head()
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6000.0 | roman | 1390 | roman | lake nyasa | iringa | 1999 | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 | 2011 |
| 1 | 0.0 | grumeti | 1399 | grumeti | lake victoria | mara | 2010 | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 | 2013 |
| 2 | 25.0 | lotteryclub | 686 | worldvision | pangani | manyara | 2009 | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 | 2013 |
| 3 | 0.0 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 1986 | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 | 2013 |
| 4 | 0.0 | actionina | 0 | artisan | lake victoria | kagera | 0 | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 | 2011 |
In [ ]:
def reduce_cardinality(full_df, installer, top_percent=5):
# Get value counts as a percentage
value_counts = full_df['installer'].value_counts(normalize=True)
# Get cumulative percentage
cumulative = value_counts.cumsum()
# Get the threshold for top X% categories
num_top = int(len(value_counts) * (top_percent / 100))
# Select top categories
top_categories = value_counts.iloc[:num_top].index
# Replace infrequent categories with 'Other'
full_df['installer'] = full_df[installer].apply(lambda x: x if x in top_categories else 'Other')
return full_df
# Apply function to both columns
full_df = reduce_cardinality(full_df, 'installer', top_percent=5)
# Check results
full_df['installer'].value_counts()
Out[ ]:
installer
dwe 17428
Other 10628
unknown 3659
government 1891
hesawa 1395
...
roman 94
oikoseafrica 93
vwc 91
cefa 90
missi 87
Name: count, Length: 93, dtype: int64
In [ ]:
def reduce_cardinality(full_df, funder, top_percent=5):
# Get value counts as a percentage
value_counts = full_df['funder'].value_counts(normalize=True)
# Get cumulative percentage
cumulative = value_counts.cumsum()
# Get the threshold for top X% categories
num_top = int(len(value_counts) * (top_percent / 100))
# Select top categories
top_categories = value_counts.iloc[:num_top].index
# Replace infrequent categories with 'Other'
full_df['funder'] = full_df[funder].apply(lambda x: x if x in top_categories else 'Other')
return full_df
# Apply function to both columns
full_df = reduce_cardinality(full_df, 'funder', top_percent=5)
# Check results
full_df['funder'].value_counts()
Out[ ]:
funder
Other 12170
governmentoftanzania 9084
unknown 3641
danida 3114
hesawa 2202
...
halmashauriyawilayasikonge 102
germany 101
hsw 101
finwater 101
twe 97
Name: count, Length: 94, dtype: int64
In [ ]:
full_df.head()
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | status_group | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6000.0 | roman | 1390 | roman | lake nyasa | iringa | 1999 | gravity | gravity | vwc | user-group | pay annually | soft | enough | spring | communal standpipe | 0 | 2011 |
| 1 | 0.0 | Other | 1399 | Other | lake victoria | mara | 2010 | gravity | gravity | wug | user-group | never pay | soft | insufficient | rainwater harvesting | communal standpipe | 0 | 2013 |
| 2 | 25.0 | Other | 686 | worldvision | pangani | manyara | 2009 | gravity | gravity | vwc | user-group | pay per bucket | soft | enough | dam | communal standpipe multiple | 0 | 2013 |
| 3 | 0.0 | unicef | 263 | unicef | ruvuma / southern coast | mtwara | 1986 | submersible | submersible | vwc | user-group | never pay | soft | dry | machine dbh | communal standpipe multiple | 2 | 2013 |
| 4 | 0.0 | Other | 0 | artisan | lake victoria | kagera | 0 | gravity | gravity | other | other | never pay | soft | seasonal | rainwater harvesting | communal standpipe | 0 | 2011 |
In [ ]:
print(full_df.isnull().sum().sum()) # Should be 0 or low
0
In [ ]:
full_df['basin'] = full_df['basin'].str.strip()
full_df['basin'] = full_df['basin'].str.lower()
full_df['basin'] = full_df['basin'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['region'] = full_df['region'].str.strip()
full_df['region'] = full_df['region'].str.lower()
full_df['region'] = full_df['region'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['extraction_type'] = full_df['extraction_type'].str.strip()
full_df['extraction_type'] = full_df['extraction_type'].str.lower()
full_df['extraction_type'] = full_df['extraction_type'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['management'] = full_df['management'].str.strip()
full_df['management'] = full_df['management'].str.lower()
full_df['management'] = full_df['management'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['extraction_type_class'] = full_df['extraction_type_class'].str.strip()
full_df['extraction_type_class'] = full_df['extraction_type_class'].str.lower()
full_df['extraction_type_class'] = full_df['extraction_type_class'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['management_group'] = full_df['management_group'].str.strip()
full_df['management_group'] = full_df['management_group'].str.lower()
full_df['management_group'] = full_df['management_group'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['payment'] = full_df['payment'].str.strip()
full_df['payment'] = full_df['payment'].str.lower()
full_df['payment'] = full_df['payment'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['water_quality'] = full_df['water_quality'].str.strip()
full_df['water_quality'] = full_df['water_quality'].str.lower()
full_df['water_quality'] = full_df['water_quality'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['quantity'] = full_df['quantity'].str.strip()
full_df['quantity'] = full_df['quantity'].str.lower()
full_df['quantity'] = full_df['quantity'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['source'] = full_df['source'].str.strip()
full_df['source'] = full_df['source'].str.lower()
full_df['source'] = full_df['source'].str.replace('[^a-zA-Z0-9]', '', regex=True)
full_df['waterpoint_type'] = full_df['waterpoint_type'].str.strip()
full_df['waterpoint_type'] = full_df['waterpoint_type'].str.lower()
full_df['waterpoint_type'] = full_df['waterpoint_type'].str.replace('[^a-zA-Z0-9]', '', regex=True)
In [ ]:
X = full_df.drop(columns=['status_group'])
y = full_df['status_group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print(X_train, X_test, y_train, y_test)
amount_tsh funder gps_height installer \
3607 50.0 dmdd 2092 dmdd
50870 0.0 Other 0 gove
20413 0.0 kkkt 0 kkkt
52806 0.0 unknown 0 unknown
50091 300.0 ki 1023 ki
... ... ... ... ...
54343 1000.0 dhv 327 dwe
38158 1000.0 danida 1743 danid
860 0.0 Other -13 Other
15795 0.0 rwssp 0 wedeco
56422 0.0 governmentoftanzania 1290 districtcouncil
basin region construction_year extraction_type \
3607 internal manyara 1998 gravity
50870 internal dodoma 0 indiamarkii
20413 lakerukwa mbeya 0 other
52806 rufiji mbeya 0 gravity
50091 wamiruvu morogoro 1997 other
... ... ... ... ...
54343 rufiji morogoro 1995 gravity
38158 lakenyasa iringa 1988 gravity
860 pangani tanga 2005 other
15795 lakevictoria shinyanga 0 niratanira
56422 pangani kilimanjaro 1976 gravity
extraction_type_class management management_group \
3607 gravity waterboard usergroup
50870 handpump vwc usergroup
20413 other vwc usergroup
52806 gravity vwc usergroup
50091 other vwc usergroup
... ... ... ...
54343 gravity wug usergroup
38158 gravity vwc usergroup
860 other privateoperator commercial
15795 handpump wug usergroup
56422 gravity company commercial
payment water_quality quantity source \
3607 payperbucket soft insufficient spring
50870 neverpay soft enough shallowwell
20413 neverpay soft enough shallowwell
52806 paymonthly soft insufficient river
50091 paywhenschemefails salty enough shallowwell
... ... ... ... ...
54343 paymonthly soft enough river
38158 payannually soft seasonal spring
860 neverpay soft insufficient shallowwell
15795 neverpay soft enough shallowwell
56422 paymonthly soft enough spring
waterpoint_type year_recorded
3607 communalstandpipe 2013
50870 handpump 2011
20413 other 2011
52806 communalstandpipe 2011
50091 other 2011
... ... ...
54343 communalstandpipe 2011
38158 communalstandpipe 2011
860 other 2011
15795 handpump 2012
56422 communalstandpipemultiple 2013
[47520 rows x 17 columns] amount_tsh funder gps_height installer \
2980 0.0 ruralwatersupplyandsanitat 0 dwe
5246 0.0 Other 0 Other
22659 10.0 Other 1675 dwe
39888 0.0 kkkt 0 kkkt
13361 50.0 wateraid 1109 sema
... ... ... ... ...
17841 200.0 anglicanchurch 1775 angli
20222 500.0 governmentoftanzania 1850 dwe
40219 0.0 danida 0 dwe
19599 0.0 Other 642 Other
41264 0.0 governmentoftanzania 1430 dwe
basin region construction_year extraction_type \
2980 laketanganyika shinyanga 0 other
5246 laketanganyika tabora 0 indiamarkii
22659 internal manyara 2008 gravity
39888 lakerukwa mbeya 0 niratanira
13361 internal singida 2011 mono
... ... ... ... ...
17841 lakenyasa iringa 2001 gravity
20222 pangani arusha 1990 gravity
40219 lakevictoria kagera 0 gravity
19599 wamiruvu morogoro 2004 gravity
41264 pangani tanga 2007 gravity
extraction_type_class management management_group payment \
2980 other wug usergroup unknown
5246 handpump vwc usergroup neverpay
22659 gravity waterboard usergroup payperbucket
39888 handpump vwc usergroup neverpay
13361 motorpump wua usergroup payperbucket
... ... ... ... ...
17841 gravity vwc usergroup paymonthly
20222 gravity vwc usergroup paywhenschemefails
40219 gravity vwc usergroup neverpay
19599 gravity vwc usergroup neverpay
41264 gravity vwc usergroup neverpay
water_quality quantity source waterpoint_type \
2980 soft dry shallowwell other
5246 milky insufficient shallowwell handpump
22659 soft insufficient spring communalstandpipe
39888 soft seasonal shallowwell handpump
13361 soft enough machinedbh communalstandpipemultiple
... ... ... ... ...
17841 soft enough spring communalstandpipe
20222 soft enough spring other
40219 soft dry spring communalstandpipe
19599 soft insufficient spring communalstandpipe
41264 soft enough spring communalstandpipe
year_recorded
2980 2012
5246 2012
22659 2013
39888 2011
13361 2013
... ...
17841 2011
20222 2013
40219 2011
19599 2011
41264 2011
[11880 rows x 17 columns] 3607 0
50870 0
20413 2
52806 2
50091 2
..
54343 0
38158 0
860 2
15795 0
56422 2
Name: status_group, Length: 47520, dtype: int8 2980 2
5246 0
22659 0
39888 2
13361 0
..
17841 0
20222 0
40219 2
19599 0
41264 0
Name: status_group, Length: 11880, dtype: int8
In [ ]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(47520, 17) (11880, 17) (47520,) (11880,)
In [ ]:
from scipy.stats import ttest_ind
In [ ]:
# Identify original categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=['number']).columns.tolist()
print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)
Categorical Columns: ['funder', 'installer', 'basin', 'region', 'extraction_type', 'extraction_type_class', 'management', 'management_group', 'payment', 'water_quality', 'quantity', 'source', 'waterpoint_type'] Numerical Columns: ['amount_tsh', 'gps_height', 'construction_year', 'year_recorded']
In [ ]:
gps_height_means = X_train.groupby(['region', 'basin'])['gps_height'].mean().reset_index()
gps_height_means
Out[ ]:
| region | basin | gps_height | |
|---|---|---|---|
| 0 | arusha | internal | 1480.300095 |
| 1 | arusha | lakevictoria | 1935.375000 |
| 2 | arusha | pangani | 1337.661300 |
| 3 | daressalaam | wamiruvu | 31.668196 |
| 4 | dodoma | internal | 0.000000 |
| 5 | dodoma | rufiji | 0.000000 |
| 6 | dodoma | wamiruvu | 0.000000 |
| 7 | iringa | lakenyasa | 1876.021260 |
| 8 | iringa | rufiji | 1620.553619 |
| 9 | kagera | laketanganyika | 0.000000 |
| 10 | kagera | lakevictoria | 0.000000 |
| 11 | kigoma | laketanganyika | 1234.369845 |
| 12 | kilimanjaro | internal | 1551.742424 |
| 13 | kilimanjaro | pangani | 1164.475405 |
| 14 | lindi | rufiji | 152.656250 |
| 15 | lindi | ruvumasoutherncoast | 214.011804 |
| 16 | manyara | internal | 1539.252838 |
| 17 | manyara | pangani | 973.857143 |
| 18 | manyara | wamiruvu | 1346.363636 |
| 19 | mara | lakevictoria | 1341.274214 |
| 20 | mbeya | lakenyasa | 0.000000 |
| 21 | mbeya | lakerukwa | 0.000000 |
| 22 | mbeya | rufiji | 0.000000 |
| 23 | morogoro | rufiji | 331.345262 |
| 24 | morogoro | wamiruvu | 420.571762 |
| 25 | mtwara | ruvumasoutherncoast | 257.667394 |
| 26 | mwanza | laketanganyika | 0.000000 |
| 27 | mwanza | lakevictoria | 150.095953 |
| 28 | pwani | rufiji | 30.629108 |
| 29 | pwani | wamiruvu | 104.153794 |
| 30 | rukwa | lakerukwa | 1421.928659 |
| 31 | rukwa | laketanganyika | 1386.082683 |
| 32 | ruvuma | lakenyasa | 1004.507009 |
| 33 | ruvuma | rufiji | 860.492823 |
| 34 | ruvuma | ruvumasoutherncoast | 819.211538 |
| 35 | shinyanga | internal | 0.000000 |
| 36 | shinyanga | laketanganyika | 0.000000 |
| 37 | shinyanga | lakevictoria | 101.463576 |
| 38 | singida | internal | 1424.201643 |
| 39 | singida | lakerukwa | 1224.000000 |
| 40 | singida | laketanganyika | 1398.125000 |
| 41 | singida | rufiji | 1262.712329 |
| 42 | tabora | internal | 0.000000 |
| 43 | tabora | lakerukwa | 0.000000 |
| 44 | tabora | laketanganyika | 0.000000 |
| 45 | tabora | rufiji | 0.000000 |
| 46 | tanga | pangani | 567.867550 |
| 47 | tanga | wamiruvu | 17.752809 |
In [ ]:
X_train = X_train.merge(gps_height_means, on=['region', 'basin'], suffixes=('', '_mean'))
In [ ]:
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | gps_height_mean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092 | dmdd | internal | manyara | 1998 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 1539.252838 |
| 1 | 0.0 | Other | 0 | gove | internal | dodoma | 0 | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | 0.000000 |
| 2 | 0.0 | kkkt | 0 | kkkt | lakerukwa | mbeya | 0 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | 0.000000 |
| 3 | 0.0 | unknown | 0 | unknown | rufiji | mbeya | 0 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | 0.000000 |
| 4 | 300.0 | ki | 1023 | ki | wamiruvu | morogoro | 1997 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | 420.571762 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327 | dwe | rufiji | morogoro | 1995 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | 331.345262 |
| 47516 | 1000.0 | danida | 1743 | danid | lakenyasa | iringa | 1988 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | 1876.021260 |
| 47517 | 0.0 | Other | -13 | Other | pangani | tanga | 2005 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | 567.867550 |
| 47518 | 0.0 | rwssp | 0 | wedeco | lakevictoria | shinyanga | 0 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | 101.463576 |
| 47519 | 0.0 | governmentoftanzania | 1290 | districtcouncil | pangani | kilimanjaro | 1976 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | 1164.475405 |
47520 rows × 18 columns
In [ ]:
basin_means = X_train[X_train['gps_height'] > 0].groupby('basin')['gps_height'].mean().reset_index()
basin_means
Out[ ]:
| basin | gps_height | |
|---|---|---|
| 0 | internal | 1474.360011 |
| 1 | lakenyasa | 1525.119944 |
| 2 | lakerukwa | 1421.685504 |
| 3 | laketanganyika | 1268.308540 |
| 4 | lakevictoria | 1328.149756 |
| 5 | pangani | 1099.034678 |
| 6 | rufiji | 1088.509645 |
| 7 | ruvumasoutherncoast | 450.121398 |
| 8 | wamiruvu | 306.012319 |
In [ ]:
X_train = X_train.merge(basin_means, on='basin', suffixes=('', '_basin_mean'))
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | gps_height_mean | gps_height_basin_mean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092 | dmdd | internal | manyara | 1998 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 1539.252838 | 1474.360011 |
| 1 | 0.0 | Other | 0 | gove | internal | dodoma | 0 | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | 0.000000 | 1474.360011 |
| 2 | 0.0 | kkkt | 0 | kkkt | lakerukwa | mbeya | 0 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | 0.000000 | 1421.685504 |
| 3 | 0.0 | unknown | 0 | unknown | rufiji | mbeya | 0 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | 0.000000 | 1088.509645 |
| 4 | 300.0 | ki | 1023 | ki | wamiruvu | morogoro | 1997 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | 420.571762 | 306.012319 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327 | dwe | rufiji | morogoro | 1995 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | 331.345262 | 1088.509645 |
| 47516 | 1000.0 | danida | 1743 | danid | lakenyasa | iringa | 1988 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | 1876.021260 | 1525.119944 |
| 47517 | 0.0 | Other | -13 | Other | pangani | tanga | 2005 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | 567.867550 | 1099.034678 |
| 47518 | 0.0 | rwssp | 0 | wedeco | lakevictoria | shinyanga | 0 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | 101.463576 | 1328.149756 |
| 47519 | 0.0 | governmentoftanzania | 1290 | districtcouncil | pangani | kilimanjaro | 1976 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | 1164.475405 | 1099.034678 |
47520 rows × 19 columns
In [ ]:
X_train['gps_height'] = np.where(X_train['gps_height'] == 0, X_train['gps_height_mean'], X_train['gps_height'])
X_train['gps_height'] = np.where(X_train['gps_height'] == 0, X_train['gps_height_basin_mean'], X_train['gps_height'])
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | gps_height_mean | gps_height_basin_mean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 1539.252838 | 1474.360011 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | 0 | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | 0.000000 | 1474.360011 |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | 0 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | 0.000000 | 1421.685504 |
| 3 | 0.0 | unknown | 1088.509645 | unknown | rufiji | mbeya | 0 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | 0.000000 | 1088.509645 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | 420.571762 | 306.012319 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | 331.345262 | 1088.509645 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | 1876.021260 | 1525.119944 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | 567.867550 | 1099.034678 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | 0 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | 101.463576 | 1328.149756 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | 1164.475405 | 1099.034678 |
47520 rows × 19 columns
In [ ]:
X_train.drop(columns=['gps_height_mean'], inplace=True)
X_train.drop(columns=['gps_height_basin_mean'], inplace=True)
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | 0 | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | 0 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 |
| 3 | 0.0 | unknown | 1088.509645 | unknown | rufiji | mbeya | 0 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | 0 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 |
47520 rows × 17 columns
In [ ]:
X_test = X_test.merge(gps_height_means, on=['region', 'basin'], how='left', suffixes=('', '_mean'))
X_test['gps_height'] = np.where(X_test['gps_height'] == 0, X_test['gps_height_mean'], X_test['gps_height'])
X_test.drop(columns=['gps_height_mean'], inplace=True)
X_test = X_test.merge(basin_means, on='basin', how='left', suffixes=('', '_basin_mean'))
X_test['gps_height'] = np.where(X_test['gps_height'] == 0, X_test['gps_height_basin_mean'], X_test['gps_height'])
X_test.drop(columns=['gps_height_basin_mean'], inplace=True)
In [ ]:
print("Remaining zeroes in gps_height (train):", (X_train['gps_height'] == 0).sum())
print("Remaining zeroes in gps_height (test):", (X_test['gps_height'] == 0).sum())
Remaining zeroes in gps_height (train): 0 Remaining zeroes in gps_height (test): 0
In [ ]:
unknown_installer_train = X_train['installer'].isin(["", "unknown"]).sum()
unknown_installer_test = X_test['installer'].isin(["", "unknown"]).sum()
print(f"Unknown installers in Train: {unknown_installer_train}")
print(f"Unknown installers in Test: {unknown_installer_test}")
Unknown installers in Train: 2892 Unknown installers in Test: 767
In [ ]:
installer_mode_region = (
X_train[~X_train['installer'].isin(["", "unknown"])]
.groupby('region')['installer']
.agg(lambda x: x.mode()[0])
.reset_index()
)
installer_mode_region
Out[ ]:
| region | installer | |
|---|---|---|
| 0 | arusha | dwe |
| 1 | daressalaam | 0 |
| 2 | dodoma | Other |
| 3 | iringa | dwe |
| 4 | kagera | dwe |
| 5 | kigoma | dwe |
| 6 | kilimanjaro | dwe |
| 7 | lindi | Other |
| 8 | manyara | dwe |
| 9 | mara | Other |
| 10 | mbeya | kkkt |
| 11 | morogoro | dwe |
| 12 | mtwara | lga |
| 13 | mwanza | dwe |
| 14 | pwani | Other |
| 15 | rukwa | dwe |
| 16 | ruvuma | dwe |
| 17 | shinyanga | dwe |
| 18 | singida | Other |
| 19 | tabora | Other |
| 20 | tanga | dwe |
In [ ]:
X_train = X_train.merge(installer_mode_region, on='region', suffixes=('','_region_mode'))
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | installer_region_mode | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | dwe |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | 0 | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | Other |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | 0 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | kkkt |
| 3 | 0.0 | unknown | 1088.509645 | unknown | rufiji | mbeya | 0 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | kkkt |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | dwe |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | dwe |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | dwe |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | dwe |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | 0 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | dwe |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | dwe |
47520 rows × 18 columns
In [ ]:
X_train['installer'] = np.where(X_train['installer'].isin(["", "unknown"]),
X_train['installer_region_mode'],
X_train['installer'])
In [ ]:
X_train.drop(columns=['installer_region_mode'], inplace=True)
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | 0 | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | 0 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 |
| 3 | 0.0 | unknown | 1088.509645 | kkkt | rufiji | mbeya | 0 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | 0 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 |
47520 rows × 17 columns
In [ ]:
X_test = X_test.merge(installer_mode_region, on='region', how='left', suffixes=('', '_region_mode'))
In [ ]:
X_test['installer'] = np.where(X_test['installer'].isin(["", "unknown"]),
X_test['installer_region_mode'],
X_test['installer'])
In [ ]:
X_test.drop(columns=['installer_region_mode'], inplace=True)
In [ ]:
print("Remaining 'unknown' or missing values in installer (train):", X_train['installer'].isin(["", "unknown"]).sum())
print("Remaining 'unknown' or missing values in installer (test):", X_test['installer'].isin(["", "unknown"]).sum())
Remaining 'unknown' or missing values in installer (train): 0 Remaining 'unknown' or missing values in installer (test): 0
In [ ]:
construction_median_by_group = (
X_train[X_train['construction_year'] > 0]
.groupby(['region', 'installer'])['construction_year']
.median()
.reset_index()
)
construction_median_by_group
Out[ ]:
| region | installer | construction_year | |
|---|---|---|---|
| 0 | arusha | Other | 2006.0 |
| 1 | arusha | adracommunity | 2002.0 |
| 2 | arusha | community | 2007.0 |
| 3 | arusha | dmdd | 2002.0 |
| 4 | arusha | dwe | 2000.0 |
| ... | ... | ... | ... |
| 247 | tanga | tasaf | 2001.5 |
| 248 | tanga | twe | 2008.5 |
| 249 | tanga | villagers | 2006.5 |
| 250 | tanga | vwc | 2009.0 |
| 251 | tanga | worldvision | 2002.0 |
252 rows × 3 columns
In [ ]:
X_train = X_train.merge(construction_median_by_group, on=['region', 'installer'], how='left', suffixes=('', '_median'))
In [ ]:
X_train['construction_year'] = np.where(X_train['construction_year'] == 0,
X_train['construction_year_median'],
X_train['construction_year'])
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | construction_year_median | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998.0 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 2003.0 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | NaN | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | NaN |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | NaN | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | NaN |
| 3 | 0.0 | unknown | 1088.509645 | kkkt | rufiji | mbeya | NaN | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | NaN |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997.0 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | 2004.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995.0 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | 1996.0 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988.0 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | 1988.0 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005.0 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | 2000.5 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | NaN | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | NaN |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976.0 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | 1994.0 |
47520 rows × 18 columns
In [ ]:
X_train.drop(columns=['construction_year_median'], inplace=True)
In [ ]:
installer_median = (
X_train[X_train['construction_year'] > 0]
.groupby('installer')['construction_year']
.median()
.reset_index()
)
installer_median
Out[ ]:
| installer | construction_year | |
|---|---|---|
| 0 | 0 | 2000.0 |
| 1 | Other | 2005.0 |
| 2 | acra | 2009.0 |
| 3 | adra | 2006.0 |
| 4 | adracommunity | 2002.0 |
| ... | ... | ... |
| 79 | world | 2000.0 |
| 80 | worldbank | 2010.0 |
| 81 | worldvision | 2003.0 |
| 82 | wu | 2008.0 |
| 83 | wvt | 2007.0 |
84 rows × 2 columns
In [ ]:
X_train = X_train.merge(installer_median, on='installer', how='left', suffixes=('', '_installer_median'))
In [ ]:
X_train['construction_year'] = np.where(X_train['construction_year'] == 0,
X_train['construction_year_installer_median'],
X_train['construction_year'])
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | construction_year_installer_median | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998.0 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 2003.0 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | NaN | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | NaN |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | NaN | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | 2000.0 |
| 3 | 0.0 | unknown | 1088.509645 | kkkt | rufiji | mbeya | NaN | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | 2000.0 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997.0 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | 2004.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995.0 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | 1998.0 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988.0 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | 1988.0 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005.0 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | 2005.0 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | NaN | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | 2002.0 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976.0 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | 1998.0 |
47520 rows × 18 columns
In [ ]:
X_train['construction_year'] = np.where(X_train['construction_year'] == 0 | (X_train['construction_year'].isna()),
X_train['construction_year_installer_median'],
X_train['construction_year'])
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | construction_year_installer_median | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998.0 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 2003.0 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | NaN | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | NaN |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | NaN | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | 2000.0 |
| 3 | 0.0 | unknown | 1088.509645 | kkkt | rufiji | mbeya | NaN | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | 2000.0 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997.0 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | 2004.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995.0 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | 1998.0 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988.0 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | 1988.0 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005.0 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | 2005.0 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | NaN | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | 2002.0 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976.0 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | 1998.0 |
47520 rows × 18 columns
In [ ]:
overall_median = X_train[X_train['construction_year'] > 0]['construction_year'].median()
installer_median['construction_year'] = installer_median['construction_year'].fillna(overall_median)
In [ ]:
X_train = X_train.merge(installer_median, on='installer', how='left', suffixes=('', '_installer_median'))
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | construction_year_installer_median | construction_year_installer_median | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998.0 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 2003.0 | 2003.0 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | NaN | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | NaN | NaN |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | NaN | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | 2000.0 | 2000.0 |
| 3 | 0.0 | unknown | 1088.509645 | kkkt | rufiji | mbeya | NaN | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | 2000.0 | 2000.0 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997.0 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | 2004.0 | 2004.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995.0 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | 1998.0 | 1998.0 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988.0 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | 1988.0 | 1988.0 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005.0 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | 2005.0 | 2005.0 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | NaN | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | 2002.0 | 2002.0 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976.0 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | 1998.0 | 1998.0 |
47520 rows × 19 columns
In [ ]:
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
In [ ]:
X_train['construction_year'] = np.where(
(X_train['construction_year'] == 0) | (X_train['construction_year'].isna()),
X_train['construction_year_installer_median'],
X_train['construction_year'])
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/2278067437.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train['construction_year'] = np.where(
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | construction_year_installer_median | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998.0 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 2003.0 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | NaN | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | NaN |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | 2000.0 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | 2000.0 |
| 3 | 0.0 | unknown | 1088.509645 | kkkt | rufiji | mbeya | 2000.0 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | 2000.0 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997.0 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | 2004.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995.0 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | 1998.0 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988.0 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | 1988.0 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005.0 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | 2005.0 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | 2002.0 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | 2002.0 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976.0 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | 1998.0 |
47520 rows × 18 columns
In [ ]:
X_train.drop(columns=['construction_year_installer_median'], inplace=True)
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/4246269074.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train.drop(columns=['construction_year_installer_median'], inplace=True)
In [ ]:
construction_median = X_train['construction_year'].median()
In [ ]:
X_train['construction_year'].fillna(construction_median, inplace=True)
X_train
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/1524966040.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
X_train['construction_year'].fillna(construction_median, inplace=True)
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/1524966040.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train['construction_year'].fillna(construction_median, inplace=True)
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998.0 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | 1999.0 | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | 2000.0 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 |
| 3 | 0.0 | unknown | 1088.509645 | kkkt | rufiji | mbeya | 2000.0 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997.0 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995.0 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988.0 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005.0 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | 2002.0 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976.0 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 |
47520 rows × 17 columns
In [ ]:
X_train['construction_year'] = np.ceil(X_train['construction_year']).astype(int)
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/1506013917.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train['construction_year'] = np.ceil(X_train['construction_year']).astype(int)
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | 1999 | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | 2000 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 |
| 3 | 0.0 | unknown | 1088.509645 | kkkt | rufiji | mbeya | 2000 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | 2002 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 |
47520 rows × 17 columns
In [ ]:
X_test = X_test.merge(construction_median_by_group, on=['region', 'installer'], how='left', suffixes=('', '_median'))
In [ ]:
X_test['construction_year'] = np.where(
(X_test['construction_year'] == 0) | (X_test['construction_year'].isna()),
X_test['construction_year_median'],
X_test['construction_year']
)
In [ ]:
X_test.drop(columns=['construction_year_median'], inplace=True)
In [ ]:
X_test = X_test.merge(installer_median, on='installer', how='left', suffixes=('', '_installer_median'))
In [ ]:
X_test['construction_year'] = np.where(
(X_test['construction_year'] == 0) | (X_test['construction_year'].isna()),
X_test['construction_year_installer_median'],
X_test['construction_year'])
In [ ]:
X_test.drop(columns=['construction_year_installer_median'], inplace=True)
In [ ]:
X_test['construction_year'].fillna(overall_median, inplace=True)
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/1382956376.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
X_test['construction_year'].fillna(overall_median, inplace=True)
In [ ]:
X_test['construction_year'] = np.ceil(X_test['construction_year']).astype(int)
In [ ]:
print("Remaining NaNs in construction_year (training):", X_train['construction_year'].isna().sum())
print("Remaining NaNs in construction_year (test):", X_test['construction_year'].isna().sum())
Remaining NaNs in construction_year (training): 0 Remaining NaNs in construction_year (test): 0
In [ ]:
X_train['function_years'] = X_train['year_recorded'] - X_train['construction_year']
X_test['function_years'] = X_test['year_recorded'] - X_test['construction_year']
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/3602151837.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train['function_years'] = X_train['year_recorded'] - X_train['construction_year']
In [ ]:
print(X_train[['year_recorded', 'construction_year', 'function_years']].head())
print(X_test[['year_recorded', 'construction_year', 'function_years']].head())
year_recorded construction_year function_years 0 2013 1998 15 1 2011 1999 12 2 2011 2000 11 3 2011 2000 11 4 2011 1997 14 year_recorded construction_year function_years 0 2012 1998 14 1 2012 2005 7 2 2013 2008 5 3 2011 2000 11 4 2013 2011 2
In [ ]:
X_train
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | function_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50.0 | dmdd | 2092.000000 | dmdd | internal | manyara | 1998 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 15 |
| 1 | 0.0 | Other | 1474.360011 | gove | internal | dodoma | 1999 | indiamarkii | handpump | vwc | usergroup | neverpay | soft | enough | shallowwell | handpump | 2011 | 12 |
| 2 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | 2000 | other | other | vwc | usergroup | neverpay | soft | enough | shallowwell | other | 2011 | 11 |
| 3 | 0.0 | unknown | 1088.509645 | kkkt | rufiji | mbeya | 2000 | gravity | gravity | vwc | usergroup | paymonthly | soft | insufficient | river | communalstandpipe | 2011 | 11 |
| 4 | 300.0 | ki | 1023.000000 | ki | wamiruvu | morogoro | 1997 | other | other | vwc | usergroup | paywhenschemefails | salty | enough | shallowwell | other | 2011 | 14 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47515 | 1000.0 | dhv | 327.000000 | dwe | rufiji | morogoro | 1995 | gravity | gravity | wug | usergroup | paymonthly | soft | enough | river | communalstandpipe | 2011 | 16 |
| 47516 | 1000.0 | danida | 1743.000000 | danid | lakenyasa | iringa | 1988 | gravity | gravity | vwc | usergroup | payannually | soft | seasonal | spring | communalstandpipe | 2011 | 23 |
| 47517 | 0.0 | Other | -13.000000 | Other | pangani | tanga | 2005 | other | other | privateoperator | commercial | neverpay | soft | insufficient | shallowwell | other | 2011 | 6 |
| 47518 | 0.0 | rwssp | 101.463576 | wedeco | lakevictoria | shinyanga | 2002 | niratanira | handpump | wug | usergroup | neverpay | soft | enough | shallowwell | handpump | 2012 | 10 |
| 47519 | 0.0 | governmentoftanzania | 1290.000000 | districtcouncil | pangani | kilimanjaro | 1976 | gravity | gravity | company | commercial | paymonthly | soft | enough | spring | communalstandpipemultiple | 2013 | 37 |
47520 rows × 18 columns
In [ ]:
X_test
Out[ ]:
| amount_tsh | funder | gps_height | installer | basin | region | construction_year | extraction_type | extraction_type_class | management | management_group | payment | water_quality | quantity | source | waterpoint_type | year_recorded | function_years | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | ruralwatersupplyandsanitat | 1268.308540 | dwe | laketanganyika | shinyanga | 1998 | other | other | wug | usergroup | unknown | soft | dry | shallowwell | other | 2012 | 14 |
| 1 | 0.0 | Other | 1268.308540 | Other | laketanganyika | tabora | 2005 | indiamarkii | handpump | vwc | usergroup | neverpay | milky | insufficient | shallowwell | handpump | 2012 | 7 |
| 2 | 10.0 | Other | 1675.000000 | dwe | internal | manyara | 2008 | gravity | gravity | waterboard | usergroup | payperbucket | soft | insufficient | spring | communalstandpipe | 2013 | 5 |
| 3 | 0.0 | kkkt | 1421.685504 | kkkt | lakerukwa | mbeya | 2000 | niratanira | handpump | vwc | usergroup | neverpay | soft | seasonal | shallowwell | handpump | 2011 | 11 |
| 4 | 50.0 | wateraid | 1109.000000 | sema | internal | singida | 2011 | mono | motorpump | wua | usergroup | payperbucket | soft | enough | machinedbh | communalstandpipemultiple | 2013 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11875 | 200.0 | anglicanchurch | 1775.000000 | angli | lakenyasa | iringa | 2001 | gravity | gravity | vwc | usergroup | paymonthly | soft | enough | spring | communalstandpipe | 2011 | 10 |
| 11876 | 500.0 | governmentoftanzania | 1850.000000 | dwe | pangani | arusha | 1990 | gravity | gravity | vwc | usergroup | paywhenschemefails | soft | enough | spring | other | 2013 | 23 |
| 11877 | 0.0 | danida | 1328.149756 | dwe | lakevictoria | kagera | 1998 | gravity | gravity | vwc | usergroup | neverpay | soft | dry | spring | communalstandpipe | 2011 | 13 |
| 11878 | 0.0 | Other | 642.000000 | Other | wamiruvu | morogoro | 2004 | gravity | gravity | vwc | usergroup | neverpay | soft | insufficient | spring | communalstandpipe | 2011 | 7 |
| 11879 | 0.0 | governmentoftanzania | 1430.000000 | dwe | pangani | tanga | 2007 | gravity | gravity | vwc | usergroup | neverpay | soft | enough | spring | communalstandpipe | 2011 | 4 |
11880 rows × 18 columns
In [ ]:
training_numeric_cols = X_train.select_dtypes(include=['number'])
In [ ]:
corr_matrix = training_numeric_cols.corr()
corr_matrix
Out[ ]:
| amount_tsh | gps_height | construction_year | year_recorded | function_years | |
|---|---|---|---|---|---|
| amount_tsh | 1.000000 | 0.028151 | -0.000417 | -0.002835 | 0.000176 |
| gps_height | 0.028151 | 1.000000 | -0.025917 | 0.035815 | 0.028924 |
| construction_year | -0.000417 | -0.025917 | 1.000000 | 0.030427 | -0.996410 |
| year_recorded | -0.002835 | 0.035815 | 0.030427 | 1.000000 | 0.054296 |
| function_years | 0.000176 | 0.028924 | -0.996410 | 0.054296 | 1.000000 |
In [ ]:
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()
In [ ]:
plt.figure(figsize=(12, 8))
sns.clustermap(corr_matrix, cmap="coolwarm", linewidths=0.5, figsize=(12, 10))
plt.title("Clustered Feature Correlation Map")
plt.show()
<Figure size 1200x800 with 0 Axes>
In [ ]:
categorical_cols
Out[ ]:
['funder', 'installer', 'basin', 'region', 'extraction_type', 'extraction_type_class', 'management', 'management_group', 'payment', 'water_quality', 'quantity', 'source', 'waterpoint_type']
In [ ]:
label_encoders = {}
for col in categorical_cols:
le = LabelEncoder()
X_train[col] = le.fit_transform(X_train[col])
X_test[col] = le.transform(X_test[col])
label_encoders[col] = le
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col]) /var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/638246821.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[col] = le.fit_transform(X_train[col])
In [ ]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
Decision Tree Accuracy: 0.7505
precision recall f1-score support
0 0.78 0.82 0.80 6457
1 0.36 0.33 0.35 851
2 0.78 0.73 0.75 4572
accuracy 0.75 11880
macro avg 0.64 0.63 0.63 11880
weighted avg 0.75 0.75 0.75 11880
In [ ]:
dt_model = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(dt_model, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Initial Decision Tree")
plt.show()
In [ ]:
class_counts = y_train.value_counts()
class_counts
Out[ ]:
status_group 0 25802 2 18252 1 3466 Name: count, dtype: int64
In [ ]:
plt.figure(figsize=(6,4))
sns.barplot(x=class_counts.index, y=class_counts.values, palette="coolwarm")
plt.xlabel("Status Group (Target)")
plt.ylabel("Count")
plt.title("Class Distribution in Training Set")
plt.xticks(ticks=[0, 1, 2], labels=["Functional", "Needs Repair", "Not Functional"])
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/3244613844.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=class_counts.index, y=class_counts.values, palette="coolwarm")
In [ ]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)
y_pred_resampled = model.predict(X_test)
print(f"Decision Tree Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred_resampled))
Decision Tree Accuracy: 0.7505
precision recall f1-score support
0 0.80 0.75 0.78 6457
1 0.29 0.48 0.37 851
2 0.77 0.74 0.75 4572
accuracy 0.73 11880
macro avg 0.62 0.66 0.63 11880
weighted avg 0.75 0.73 0.74 11880
In [ ]:
cm_smote = confusion_matrix(y_test, y_pred_resampled)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_smote, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - SMOTE Decision Tree")
plt.show()
In [ ]:
dt_model_weighted = DecisionTreeClassifier(random_state=42, class_weight="balanced")
dt_model_weighted.fit(X_train, y_train)
y_pred_weighted = dt_model_weighted.predict(X_test)
print(f"Decision Tree Accuracy (Weighted): {accuracy_score(y_test, y_pred_weighted):.4f}")
print(classification_report(y_test, y_pred_weighted))
Decision Tree Accuracy (Weighted): 0.7278
precision recall f1-score support
0 0.80 0.75 0.78 6457
1 0.29 0.47 0.36 851
2 0.76 0.74 0.75 4572
accuracy 0.73 11880
macro avg 0.62 0.65 0.63 11880
weighted avg 0.75 0.73 0.74 11880
In [ ]:
cm_weighted = confusion_matrix(y_test, y_pred_weighted)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_weighted, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - Class Weighted Decision Tree")
plt.show()
In [ ]:
from sklearn.model_selection import GridSearchCV
param_grid = {
'max_depth': [5, 10, 15, 20],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 5, 10],
'class_weight': [None, 'balanced']
}
dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
Best Hyperparameters: {'class_weight': None, 'max_depth': 20, 'min_samples_leaf': 5, 'min_samples_split': 2}
In [ ]:
best_dt = DecisionTreeClassifier(**grid_search.best_params_, random_state=42)
best_dt.fit(X_train, y_train)
y_pred_best = best_dt.predict(X_test)
print(f"Optimized Decision Tree Accuracy: {accuracy_score(y_test, y_pred_best):.4f}")
print(classification_report(y_test, y_pred_best))
Optimized Decision Tree Accuracy: 0.7637
precision recall f1-score support
0 0.76 0.87 0.81 6457
1 0.44 0.27 0.33 851
2 0.81 0.70 0.75 4572
accuracy 0.76 11880
macro avg 0.67 0.61 0.63 11880
weighted avg 0.76 0.76 0.76 11880
In [ ]:
cm_best_dt = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_best_dt, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - Optimized Decision Tree")
plt.show()
In [ ]:
another_best_dt = DecisionTreeClassifier(class_weight=None, max_depth=19, min_samples_leaf=6, min_samples_split=4, random_state=42)
another_best_dt.fit(X_train, y_train)
y_pred_another_best = another_best_dt.predict(X_test)
print(f"Optimized Decision Tree Accuracy: {accuracy_score(y_test, y_pred_another_best):.4f}")
print(classification_report(y_test, y_pred_another_best))
Optimized Decision Tree Accuracy: 0.7668
precision recall f1-score support
0 0.76 0.88 0.82 6457
1 0.46 0.23 0.31 851
2 0.81 0.71 0.76 4572
accuracy 0.77 11880
macro avg 0.68 0.61 0.63 11880
weighted avg 0.76 0.77 0.76 11880
In [ ]:
importances = another_best_dt.feature_importances_
feature_names = X_train.columns
# Sort features by importance
indices = np.argsort(importances)[::-1]
# Plot feature importance
plt.figure(figsize=(10, 6))
plt.title("Feature Importance - Decision Tree")
plt.bar(range(len(importances)), importances[indices], align="center")
plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
plt.xlabel("Feature Name")
plt.ylabel("Importance Score")
plt.show()
In [ ]:
# Binarize the labels for multi-class ROC curve
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
y_pred_prob = another_best_dt.predict_proba(X_test) # Get predicted probabilities
# Plot ROC Curve for each class
plt.figure(figsize=(8, 6))
for i, label in enumerate(["Functional (0)", "Needs Repair (1)", "Not Functional (2)"]):
fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_prob[:, i])
auc_score = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f"{label} (AUC = {auc_score:.2f})")
plt.plot([0, 1], [0, 1], 'k--') # Random guess line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Multi-Class Decision Tree")
plt.legend()
plt.show()
In [ ]:
cm_another_best_dt = confusion_matrix(y_test, y_pred_another_best)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_another_best_dt, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - Optimized Decision Tree")
plt.show()
In [ ]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
param_grid = {
'max_depth': [5, 10, 15, 20],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 5, 10],
'class_weight': [None, 'balanced']
}
smote_dt = DecisionTreeClassifier(random_state=42)
grid_search_smote = GridSearchCV(smote_dt, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_smote.fit(X_train_smote, y_train_smote)
best_dt_smote = DecisionTreeClassifier(**grid_search_smote.best_params_, random_state=42)
best_dt_smote.fit(X_train_smote, y_train_smote)
y_pred_best_smote = best_dt_smote.predict(X_test)
print("Best Hyperparameters with SMOTE:", grid_search_smote.best_params_)
print(f"Optimized Decision Tree Accuracy with SMOTE: {accuracy_score(y_test, y_pred_best_smote):.4f}")
print(classification_report(y_test, y_pred_best_smote))
Best Hyperparameters with SMOTE: {'class_weight': 'balanced', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
Optimized Decision Tree Accuracy with SMOTE: 0.7206
precision recall f1-score support
0 0.80 0.74 0.77 6457
1 0.27 0.54 0.36 851
2 0.79 0.72 0.75 4572
accuracy 0.72 11880
macro avg 0.62 0.67 0.63 11880
weighted avg 0.76 0.72 0.74 11880
In [ ]:
importances = best_dt_smote.feature_importances_
feature_names = X_train.columns
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title("Feature Importance - Decision Tree")
plt.bar(range(len(importances)), importances[indices], align="center")
plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
plt.xlabel("Feature Name")
plt.ylabel("Importance Score")
plt.show()
In [ ]:
from sklearn.preprocessing import label_binarize
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
y_pred_prob = best_dt_smote.predict_proba(X_test)
plt.figure(figsize=(8, 6))
for i, label in enumerate(["Functional (0)", "Needs Repair (1)", "Not Functional (2)"]):
fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_prob[:, i])
auc_score = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f"{label} (AUC = {auc_score:.2f})")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Multi-Class Decision Tree")
plt.legend()
plt.show()
In [ ]:
# Before SMOTE
plt.figure(figsize=(6, 4))
sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts().values, palette="coolwarm")
plt.xticks(ticks=[0, 1, 2], labels=["Functional", "Needs Repair", "Not Functional"])
plt.title("Class Distribution Before SMOTE")
plt.xlabel("Status Group (Target)")
plt.ylabel("Count")
plt.show()
# After SMOTE
plt.figure(figsize=(6, 4))
sns.barplot(x=y_train_resampled.value_counts().index, y=y_train_resampled.value_counts().values, palette="coolwarm")
plt.xticks(ticks=[0, 1, 2], labels=["Functional", "Needs Repair", "Not Functional"])
plt.title("Class Distribution After SMOTE")
plt.xlabel("Status Group (Target)")
plt.ylabel("Count")
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/1913125478.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts().values, palette="coolwarm")
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/1913125478.py:12: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=y_train_resampled.value_counts().index, y=y_train_resampled.value_counts().values, palette="coolwarm")
In [ ]:
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))
Random Forest Accuracy: 0.7827
precision recall f1-score support
0 0.79 0.86 0.83 6457
1 0.46 0.32 0.38 851
2 0.81 0.76 0.78 4572
accuracy 0.78 11880
macro avg 0.69 0.65 0.66 11880
weighted avg 0.78 0.78 0.78 11880
In [ ]:
param_grid = {
'n_estimators': [50, 100, 200], # Number of trees
'max_depth': [10, 20, None], # Tree depth (None means unlimited)
'min_samples_split': [2, 5, 10], # Minimum samples per split
'min_samples_leaf': [1, 2, 4], # Minimum samples per leaf
'class_weight': [None, 'balanced'] # Handle class imbalance
}
In [ ]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search_rf = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, y_train)
best_params_rf = grid_search_rf.best_params_
print("Best Hyperparameters:", best_params_rf)
Fitting 5 folds for each of 162 candidates, totalling 810 fits
/opt/anaconda3/lib/python3.12/site-packages/numpy/ma/core.py:2820: RuntimeWarning: invalid value encountered in cast _data = np.array(data, dtype=dtype, copy=copy,
Best Hyperparameters: {'class_weight': None, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
In [ ]:
best_rf = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
best_rf.fit(X_train, y_train)
# Make predictions
y_pred_best_rf = best_rf.predict(X_test)
# Evaluate performance
from sklearn.metrics import accuracy_score, classification_report
print(f"Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_best_rf):.4f}")
print(classification_report(y_test, y_pred_best_rf))
Optimized Random Forest Accuracy: 0.7992
precision recall f1-score support
0 0.78 0.92 0.84 6457
1 0.64 0.23 0.34 851
2 0.85 0.74 0.79 4572
accuracy 0.80 11880
macro avg 0.76 0.63 0.66 11880
weighted avg 0.80 0.80 0.79 11880
In [ ]:
importances = best_rf.feature_importances_
feature_names = X_train.columns
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title("Feature Importance - Random Forest")
plt.bar(range(len(importances)), importances[indices], align="center")
plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
plt.xlabel("Feature Name")
plt.ylabel("Importance Score")
plt.show()
In [ ]:
another_best_rf = RandomForestClassifier(class_weight=None, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=200)
another_best_rf.fit(X_train, y_train)
y_pred_another_rf = another_best_rf.predict(X_test)
print(f"Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_another_rf):.4f}")
print(classification_report(y_test, y_pred_another_rf))
Optimized Random Forest Accuracy: 0.7993
precision recall f1-score support
0 0.78 0.91 0.84 6457
1 0.63 0.23 0.34 851
2 0.85 0.74 0.79 4572
accuracy 0.80 11880
macro avg 0.75 0.63 0.66 11880
weighted avg 0.80 0.80 0.79 11880
In [ ]:
param_dist = {
'n_estimators': [100, 200, 300, 500], # More trees
'max_depth': [10, 20, 30, None], # More depth options
'min_samples_split': [2, 5, 10, 15], # Varying split thresholds
'min_samples_leaf': [1, 2, 4, 8], # More leaf node options
'max_features': ['sqrt', 'log2', None], # Feature selection strategy
'bootstrap': [True, False], # Try both bootstrap settings
'class_weight': [None, 'balanced'] # Class weighting
}
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
random_search_rf = RandomizedSearchCV(rf, param_distributions=param_dist,
n_iter=30, cv=5, scoring='accuracy',
n_jobs=-1, verbose=1, random_state=42)
random_search_rf.fit(X_train, y_train)
best_params_rf = random_search_rf.best_params_
print("Optimized Hyperparameters:", best_params_rf)
Fitting 5 folds for each of 30 candidates, totalling 150 fits
/opt/anaconda3/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak. warnings.warn(
Optimized Hyperparameters: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 30, 'class_weight': None, 'bootstrap': True}
In [ ]:
best_rf_rs = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
best_rf_rs.fit(X_train, y_train)
y_pred_rf_rs = best_rf_rs.predict(X_test)
print(f"Final Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf_rs):.4f}")
print(classification_report(y_test, y_pred_rf_rs))
Final Optimized Random Forest Accuracy: 0.7997
precision recall f1-score support
0 0.79 0.90 0.84 6457
1 0.59 0.26 0.36 851
2 0.84 0.76 0.80 4572
accuracy 0.80 11880
macro avg 0.74 0.64 0.66 11880
weighted avg 0.79 0.80 0.79 11880
In [ ]:
importances = best_rf_rs.feature_importances_
feature_names = X_train.columns
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title("Feature Importance - Random Forest, Random Selector")
plt.bar(range(len(importances)), importances[indices], align="center")
plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
plt.xlabel("Feature Name")
plt.ylabel("Importance Score")
plt.show()
In [ ]:
rf_rs = RandomForestClassifier(n_estimators=500, min_samples_split=11, min_samples_leaf=2, max_features='log2', max_depth=30, class_weight=None, bootstrap=True)
rf_rs.fit(X_train, y_train)
y_predict_rf_rs = rf_rs.predict(X_test)
print(f"Final Optimized Random Forest Accuracy: {accuracy_score(y_test, y_predict_rf_rs):.4f}")
print(classification_report(y_test, y_predict_rf_rs))
Final Optimized Random Forest Accuracy: 0.7992
precision recall f1-score support
0 0.78 0.91 0.84 6457
1 0.60 0.23 0.34 851
2 0.85 0.75 0.79 4572
accuracy 0.80 11880
macro avg 0.74 0.63 0.66 11880
weighted avg 0.79 0.80 0.79 11880
In [ ]:
cm_rf = confusion_matrix(y_test, y_predict_rf_rs)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - Random Forest(RandomSearch)")
plt.show()
In [ ]:
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
y_pred_prob = best_rf_rs.predict_proba(X_test)
# Plot ROC Curve for each class
plt.figure(figsize=(8, 6))
for i, label in enumerate(["Functional (0)", "Needs Repair (1)", "Not Functional (2)"]):
fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_prob[:, i])
auc_score = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f"{label} (AUC = {auc_score:.2f})")
plt.plot([0, 1], [0, 1], 'k--') # Random guess line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Random Forest(RandomSearch)")
plt.legend()
plt.show()
In [ ]:
custom_weights = {0: 0.5, 1: 1, 2: 1} # Reduce influence of Class 0
rf_weighted = RandomForestClassifier(n_estimators=400, max_depth=21, class_weight=custom_weights, random_state=42, n_jobs=-1)
rf_weighted.fit(X_train, y_train)
y_pred_rf_weighted = rf_weighted.predict(X_test)
print(f"Weighted Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf_weighted):.4f}")
print(classification_report(y_test, y_pred_rf_weighted))
Weighted Random Forest Accuracy: 0.7875
precision recall f1-score support
0 0.81 0.84 0.83 6457
1 0.49 0.34 0.40 851
2 0.79 0.80 0.79 4572
accuracy 0.79 11880
macro avg 0.70 0.66 0.67 11880
weighted avg 0.78 0.79 0.78 11880
In [ ]:
cv_scores = cross_val_score(another_best_rf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
# Print cross-validation results
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
Cross-Validation Accuracy Scores: [0.79640152 0.79377104 0.79377104 0.80039983 0.79114057] Mean CV Accuracy: 0.7951 Standard Deviation: 0.0031
In [ ]:
print(f"Total rows: {len(X_train)}")
print(f"Zero values in amount_tsh: {(X_train['amount_tsh'] == 0).sum()} ({(X_train['amount_tsh'] == 0).mean()*100:.2f}%)")
Total rows: 47520 Zero values in amount_tsh: 33331 (70.14%)
In [ ]:
X_train = X_train.drop(columns=['amount_tsh'])
X_test = X_test.drop(columns=['amount_tsh'])
In [ ]:
custom_weights = {0: 0.5, 1: 1, 2: 1}
rf_weighted = RandomForestClassifier(n_estimators=400, max_depth=20, class_weight=None, random_state=42, n_jobs=-1)
rf_weighted.fit(X_train, y_train)
y_pred_rf_weighted = rf_weighted.predict(X_test)
print(f"Weighted Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf_weighted):.4f}")
print(classification_report(y_test, y_pred_rf_weighted))
Weighted Random Forest Accuracy: 0.7960
precision recall f1-score support
0 0.79 0.90 0.84 6457
1 0.54 0.27 0.36 851
2 0.84 0.74 0.79 4572
accuracy 0.80 11880
macro avg 0.72 0.64 0.66 11880
weighted avg 0.79 0.80 0.79 11880
In [ ]:
best_rf_rs = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
best_rf_rs.fit(X_train, y_train)
y_pred_rf_rs = best_rf_rs.predict(X_test)
print(f"Final Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf_rs):.4f}")
print(classification_report(y_test, y_pred_rf_rs))
Final Optimized Random Forest Accuracy: 0.7979
precision recall f1-score support
0 0.79 0.90 0.84 6457
1 0.58 0.26 0.36 851
2 0.84 0.76 0.79 4572
accuracy 0.80 11880
macro avg 0.73 0.64 0.66 11880
weighted avg 0.79 0.80 0.79 11880
In [ ]:
cv_scores_rs = cross_val_score(best_rf_rs, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
# Print cross-validation results
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
Cross-Validation Accuracy Scores: [0.79640152 0.79377104 0.79377104 0.80039983 0.79114057] Mean CV Accuracy: 0.7951 Standard Deviation: 0.0031
In [ ]:
X_train['height_per_year'] = X_train['gps_height'] / (X_train['function_years'] + 1)
X_test['height_per_year'] = X_test['gps_height'] / (X_test['function_years'] + 1)
In [ ]:
X_train['gps_height_bin'] = pd.cut(X_train['gps_height'], bins=[-50, 500, 1500, 3000], labels=['low', 'medium', 'high'])
X_test['gps_height_bin'] = pd.cut(X_test['gps_height'], bins=[-50, 500, 1500, 3000], labels=['low', 'medium', 'high'])
In [ ]:
X_train['function_age_group'] = pd.cut(X_train['function_years'], bins=[-1, 5, 15, 30, 100], labels=['new', 'mid', 'old', 'very_old'])
X_test['function_age_group'] = pd.cut(X_test['function_years'], bins=[-1, 5, 15, 30, 100], labels=['new', 'mid', 'old', 'very_old'])
In [ ]:
new_categorical_cols = ['gps_height_bin', 'function_age_group']
le = LabelEncoder()
for col in new_categorical_cols:
X_train[col] = le.fit_transform(X_train[col])
X_test[col] = le.transform(X_test[col])
In [ ]:
print("Missing values per column:\n", X_train.isna().sum())
Missing values per column: funder 0 gps_height 0 installer 0 basin 0 region 0 construction_year 0 extraction_type 0 extraction_type_class 0 management 0 management_group 0 payment 0 water_quality 0 quantity 0 source 0 waterpoint_type 0 year_recorded 0 function_years 0 height_per_year 0 gps_height_bin 0 function_age_group 0 dtype: int64
In [ ]:
print("Infinite values per column:\n", X_train.replace([np.inf, -np.inf], np.nan).isna().sum())
Infinite values per column: funder 0 gps_height 0 installer 0 basin 0 region 0 construction_year 0 extraction_type 0 extraction_type_class 0 management 0 management_group 0 payment 0 water_quality 0 quantity 0 source 0 waterpoint_type 0 year_recorded 0 function_years 0 height_per_year 1 gps_height_bin 0 function_age_group 0 dtype: int64
In [ ]:
X_train['height_per_year'] = X_train['height_per_year'].replace([np.inf, -np.inf], np.nan)
X_train['height_per_year'] = X_train['height_per_year'].fillna(X_train['height_per_year'].median())
X_test['height_per_year'] = X_test['height_per_year'].replace([np.inf, -np.inf], np.nan)
X_test['height_per_year'] = X_test['height_per_year'].fillna(X_test['height_per_year'].median())
In [ ]:
new_rf_rs = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
new_rf_rs.fit(X_train, y_train)
y_pred_new_rf_rs = new_rf_rs.predict(X_test)
print(f"Final Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_new_rf_rs):.4f}")
print(classification_report(y_test, y_pred_new_rf_rs))
Final Optimized Random Forest Accuracy: 0.7969
precision recall f1-score support
0 0.79 0.90 0.84 6457
1 0.58 0.26 0.36 851
2 0.83 0.75 0.79 4572
accuracy 0.80 11880
macro avg 0.73 0.64 0.66 11880
weighted avg 0.79 0.80 0.79 11880
In [ ]:
upper_limit = max(201, X_train['quantity'].max() + 1) # Ensure it's strictly greater than 200
X_train['quantity_bin'] = pd.cut(X_train['quantity'], bins=[-1, 0, 50, 200, upper_limit],
labels=['no_water', 'small', 'medium', 'large'], include_lowest=True)
X_test['quantity_bin'] = pd.cut(X_test['quantity'], bins=[-1, 0, 50, 200, upper_limit],
labels=['no_water', 'small', 'medium', 'large'], include_lowest=True)
In [ ]:
print(X_train['quantity_bin'].isna().sum())
print(X_test['quantity_bin'].isna().sum())
0 0
In [ ]:
le = LabelEncoder()
X_train['quantity_bin'] = le.fit_transform(X_train['quantity_bin'])
X_test['quantity_bin'] = le.transform(X_test['quantity_bin'])
In [ ]:
quantitybin_rf_rs = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
quantitybin_rf_rs.fit(X_train, y_train)
y_pred_quantitybin_rf_rs = quantitybin_rf_rs.predict(X_test)
print(f"Final Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_quantitybin_rf_rs):.4f}")
print(classification_report(y_test, y_pred_quantitybin_rf_rs))
Final Optimized Random Forest Accuracy: 0.7973
precision recall f1-score support
0 0.79 0.90 0.84 6457
1 0.58 0.26 0.36 851
2 0.84 0.75 0.79 4572
accuracy 0.80 11880
macro avg 0.73 0.64 0.66 11880
weighted avg 0.79 0.80 0.79 11880
In [ ]:
cm_quantity_bin_rf = confusion_matrix(y_test, y_pred_quantitybin_rf_rs)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - Random Forest(Quantity Bin)")
plt.show()
In [ ]:
X_train['construction_age'] = pd.cut(X_train['construction_year'], bins=[0, 1960, 1980, 2000, X_train['construction_year'].max()],
labels=['very_old', 'old', 'mid_age', 'new'])
X_test['construction_age'] = pd.cut(X_test['construction_year'], bins=[0, 1960, 1980, 2000, X_test['construction_year'].max()],
labels=['very_old', 'old', 'mid_age', 'new'])
In [ ]:
le = LabelEncoder()
X_train['construction_age'] = le.fit_transform(X_train['construction_age'])
X_test['construction_age'] = le.transform(X_test['construction_age'])
In [ ]:
construction_bin_rf = RandomForestClassifier(class_weight=None, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=200)
construction_bin_rf.fit(X_train, y_train)
y_pred_construction_bin_rf = construction_bin_rf.predict(X_test)
print(f"Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_construction_bin_rf):.4f}")
print(classification_report(y_test, y_pred_construction_bin_rf))
Optimized Random Forest Accuracy: 0.7993
precision recall f1-score support
0 0.78 0.92 0.84 6457
1 0.63 0.23 0.34 851
2 0.85 0.74 0.79 4572
accuracy 0.80 11880
macro avg 0.75 0.63 0.66 11880
weighted avg 0.80 0.80 0.79 11880
In [ ]:
construction_bin_rf = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
construction_bin_rf.fit(X_train, y_train)
y_pred_construction_bin_rf = construction_bin_rf.predict(X_test)
print(f"Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_construction_bin_rf):.4f}")
print(classification_report(y_test, y_pred_construction_bin_rf))
Optimized Random Forest Accuracy: 0.7980
precision recall f1-score support
0 0.79 0.90 0.84 6457
1 0.58 0.26 0.36 851
2 0.84 0.75 0.79 4572
accuracy 0.80 11880
macro avg 0.73 0.64 0.66 11880
weighted avg 0.79 0.80 0.79 11880
In [ ]:
X_train['extraction_modernity'] = X_train['extraction_type_class'].apply(lambda x: 'modern' if x in ['gravity', 'motorpump'] else 'traditional')
X_test['extraction_modernity'] = X_test['extraction_type_class'].apply(lambda x: 'modern' if x in ['gravity', 'motorpump'] else 'traditional')
In [ ]:
le = LabelEncoder()
X_train['extraction_modernity'] = le.fit_transform(X_train['extraction_modernity'])
X_test['extraction_modernity'] = le.transform(X_test['extraction_modernity'])
In [ ]:
extraction_bin_rf = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
extraction_bin_rf.fit(X_train, y_train)
y_pred_extraction_bin_rf = extraction_bin_rf.predict(X_test)
print(f"Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_extraction_bin_rf):.4f}")
print(classification_report(y_test, y_pred_extraction_bin_rf))
Optimized Random Forest Accuracy: 0.7976
precision recall f1-score support
0 0.79 0.90 0.84 6457
1 0.57 0.25 0.35 851
2 0.84 0.75 0.79 4572
accuracy 0.80 11880
macro avg 0.73 0.63 0.66 11880
weighted avg 0.79 0.80 0.79 11880
In [ ]:
extraction_bin_rf = RandomForestClassifier(class_weight=None, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=200)
extraction_bin_rf.fit(X_train, y_train)
y_pred_extraction_bin_rf = extraction_bin_rf.predict(X_test)
print(f"Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_extraction_bin_rf):.4f}")
print(classification_report(y_test, y_pred_extraction_bin_rf))
Optimized Random Forest Accuracy: 0.7995
precision recall f1-score support
0 0.78 0.92 0.84 6457
1 0.62 0.23 0.34 851
2 0.85 0.74 0.79 4572
accuracy 0.80 11880
macro avg 0.75 0.63 0.66 11880
weighted avg 0.80 0.80 0.79 11880
In [ ]:
feature_cv_scores = cross_val_score(extraction_bin_rf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
# Print cross-validation results
print(f"Cross-Validation Accuracy Scores: {feature_cv_scores}")
print(f"Mean CV Accuracy: {feature_cv_scores.mean():.4f}")
print(f"Standard Deviation: {feature_cv_scores.std():.4f}")
Cross-Validation Accuracy Scores: [0.79208754 0.7924032 0.79324495 0.79776936 0.79114057] Mean CV Accuracy: 0.7933 Standard Deviation: 0.0023
In [ ]:
final_bin_cm_rf = confusion_matrix(y_test, y_pred_extraction_bin_rf)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - Random Forest(Feature Engineered)")
plt.show()
In [ ]:
cm_trial = confusion_matrix(y_test, y_pred_extraction_bin_rf)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - Random Forest(Error Analysis)")
plt.show()
In [ ]:
fp_0 = cm_trial[1, 0] + cm_trial[2, 0] # False positives for class 0
fp_1 = cm_trial[0, 1] + cm_trial[2, 1] # False positives for class 1
fp_2 = cm_trial[0, 2] + cm_trial[1, 2] # False positives for class 2
print(f"False Positives for class 0 (Functional): {fp_0}")
print(f"False Positives for class 1 (Functional Needs Repair): {fp_1}")
print(f"False Positives for class 2 (Not Functional): {fp_2}")
False Positives for class 0 (Functional): 1689 False Positives for class 1 (Functional Needs Repair): 118 False Positives for class 2 (Not Functional): 575
In [ ]:
extraction_bin_rf = RandomForestClassifier(class_weight=None, max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=200)
extraction_bin_rf.fit(X_train, y_train)
y_pred_extraction_bin_rf = extraction_bin_rf.predict(X_test)
print(f"Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_extraction_bin_rf):.4f}")
print(classification_report(y_test, y_pred_extraction_bin_rf))
Optimized Random Forest Accuracy: 0.7984
precision recall f1-score support
0 0.78 0.92 0.84 6457
1 0.63 0.24 0.35 851
2 0.85 0.74 0.79 4572
accuracy 0.80 11880
macro avg 0.75 0.63 0.66 11880
weighted avg 0.80 0.80 0.79 11880
In [ ]:
feature_cv_scores = cross_val_score(extraction_bin_rf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
# Print cross-validation results
print(f"Cross-Validation Accuracy Scores: {feature_cv_scores}")
print(f"Mean CV Accuracy: {feature_cv_scores.mean():.4f}")
print(f"Standard Deviation: {feature_cv_scores.std():.4f}")
Cross-Validation Accuracy Scores: [0.79398148 0.7940867 0.79377104 0.79776936 0.79050926] Mean CV Accuracy: 0.7940 Standard Deviation: 0.0023
In [ ]:
# Count how many times the model predicted 2 when the actual class was 0 or 1
fp_2_from_0 = ((y_pred == 2) & (y_test == 0)).sum()
fp_2_from_1 = ((y_pred == 2) & (y_test == 1)).sum()
# Sum them up
actual_fp_class_2 = fp_2_from_0 + fp_2_from_1
print(f"False Positives for Class 2 (Not Functional):")
print(f" - Model predicted 2 but true class was 0: {fp_2_from_0}")
print(f" - Model predicted 2 but true class was 1: {fp_2_from_1}")
print(f" - Total False Positives for Class 2 (Should match Confusion Matrix 632): {actual_fp_class_2}")
False Positives for Class 2 (Not Functional): - Model predicted 2 but true class was 0: 816 - Model predicted 2 but true class was 1: 140 - Total False Positives for Class 2 (Should match Confusion Matrix 632): 956
In [ ]:
cv_scores = [0.79198232, 0.79313973, 0.79303451, 0.79734848, 0.78966751] # Replace with actual scores
plt.figure(figsize=(6, 4))
sns.boxplot(cv_scores, orient="h")
plt.title("Cross-Validation Accuracy Distribution")
plt.xlabel("Accuracy Score")
plt.show()
In [ ]:
report = classification_report(y_test, y_pred, output_dict=True)
# Convert to DataFrame
df_report = pd.DataFrame(report).T.iloc[:-1, :] # Drop 'accuracy' row
# Plot
sns.heatmap(df_report[['precision', 'recall', 'f1-score']].plot(kind='bar', figsize=(8, 5), rot=0)
plt.title("Precision, Recall, and F1-Score by Class")
plt.ylabel("Score")
plt.xlabel("Class")
plt.legend(title="Metric")
plt.show()
Cell In[797], line 7 sns.heatmap(df_report[['precision', 'recall', 'f1-score']].plot(kind='bar', figsize=(8, 5), rot=0) ^ SyntaxError: '(' was never closed
In [ ]:
corr_matrix = X_train.corr()
# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", linewidths=0.5)
# Titles and labels
plt.title("Feature Correlation Heatmap")
plt.show()
In [ ]:
X_train = X_train.drop(columns=['gps_height', 'function_years', 'quantity',
'construction_year', 'extraction_type', 'extraction_type_class'])
X_test = X_test.drop(columns=['gps_height', 'function_years', 'quantity',
'construction_year', 'extraction_type', 'extraction_type_class'])
In [ ]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
In [ ]:
print(X_train.head())
funder installer basin region management management_group payment \ 0 19 25 0 8 9 4 4 1 1 34 0 2 7 4 0 2 42 50 2 10 7 4 0 3 82 50 6 10 7 4 3 4 40 48 8 11 7 4 5 water_quality source waterpoint_type year_recorded height_per_year \ 0 6 8 1 2013 130.750000 1 6 7 4 2011 113.412309 2 6 7 6 2011 118.473792 3 6 6 1 2011 90.709137 4 4 7 6 2011 68.200000 gps_height_bin function_age_group quantity_bin construction_age \ 0 0 0 1 0 1 2 0 1 0 2 2 0 1 0 3 2 0 1 0 4 2 0 1 0 extraction_modernity 0 0 1 0 2 0 3 0 4 0
In [ ]:
updated_rf = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
updated_rf.fit(X_train, y_train)
y_pred_updated_rf = updated_rf.predict(X_test)
print(f"Optimized Random Forest Accuracy: {accuracy_score(y_test, y_pred_updated_rf):.4f}")
print(classification_report(y_test, y_pred_updated_rf))
Optimized Random Forest Accuracy: 0.7856
precision recall f1-score support
0 0.78 0.89 0.83 6457
1 0.57 0.23 0.33 851
2 0.82 0.74 0.78 4572
accuracy 0.79 11880
macro avg 0.72 0.62 0.64 11880
weighted avg 0.78 0.79 0.77 11880
In [ ]:
corr_matrix_updated = X_train.corr()
# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix_updated, annot=False, cmap="coolwarm", linewidths=0.5)
# Titles and labels
plt.title("Feature Correlation Heatmap")
plt.show()
In [ ]:
updated_cv_scores = cross_val_score(updated_rf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
# Print cross-validation results
print(f"Cross-Validation Accuracy Scores: {feature_cv_scores}")
print(f"Mean CV Accuracy: {feature_cv_scores.mean():.4f}")
print(f"Standard Deviation: {feature_cv_scores.std():.4f}")
Cross-Validation Accuracy Scores: [0.79198232 0.79313973 0.79303451 0.79734848 0.78966751] Mean CV Accuracy: 0.7930 Standard Deviation: 0.0025
In [ ]:
y_train = y_train.reset_index(drop=True)
In [ ]:
water_quality_needs_repair = X_train[y_train == 1]['water_quality'].value_counts()
In [ ]:
plt.figure(figsize=(10, 6))
sns.barplot(x=water_quality_needs_repair.index, y=water_quality_needs_repair.values, palette="coolwarm")
plt.title("Water Quality Distribution for Pumps That Need Repair")
plt.xlabel("Water Quality")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/3586136830.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=water_quality_needs_repair.index, y=water_quality_needs_repair.values, palette="coolwarm")
In [ ]:
print(X_train['water_quality'].unique())
[6 4 7 0 1 3 5 2]
In [ ]:
encoded_classes = le.classes_ # This retrieves the original labels in the order they were encoded
print(encoded_classes) # Check what each number represents
['traditional']
In [ ]:
print(X_train[['water_quality']].drop_duplicates().sort_values(by='water_quality'))
water_quality 28 0 31 1 4542 2 38 3 4 4 691 5 0 6 27 7
In [ ]:
water_quality_mapping = {
0: 'good',
1: 'salty',
2: 'milky',
3: 'coloured',
4: 'fluoride',
5: 'unknown',
6: 'other'
}
In [ ]:
water_quality_needs_repair.index = water_quality_needs_repair.index.map(water_quality_mapping)
plt.figure(figsize=(10, 6))
sns.barplot(x=water_quality_needs_repair.index, y=water_quality_needs_repair.values, palette="coolwarm")
plt.title("Water Quality Distribution for Pumps That Need Repair")
plt.xlabel("Water Quality")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/1893826344.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=water_quality_needs_repair.index, y=water_quality_needs_repair.values, palette="coolwarm")
In [ ]:
# Merge Needs Repair (1) into Not Functional (2)
y_train = y_train.replace({1: 2})
y_test = y_test.replace({1: 2})
In [ ]:
rf_model = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))
Random Forest Accuracy: 0.8072
precision recall f1-score support
0 0.80 0.87 0.83 6457
2 0.82 0.74 0.78 5423
accuracy 0.81 11880
macro avg 0.81 0.80 0.80 11880
weighted avg 0.81 0.81 0.81 11880
In [ ]:
plt.figure(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d', cmap="Blues", xticklabels=["Functional", "Non-Functional"], yticklabels=["Functional", "Non-Functional"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - Merged Classes")
plt.show()
In [ ]:
merged_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
# Print cross-validation results
print(f"Cross-Validation Accuracy Scores: {feature_cv_scores}")
print(f"Mean CV Accuracy: {feature_cv_scores.mean():.4f}")
print(f"Standard Deviation: {feature_cv_scores.std():.4f}")
Cross-Validation Accuracy Scores: [0.79198232 0.79313973 0.79303451 0.79734848 0.78966751] Mean CV Accuracy: 0.7930 Standard Deviation: 0.0025
In [ ]:
water_quality_functional = X_train[y_train == 0]['water_quality'].value_counts()
In [ ]:
water_quality_functional.index = water_quality_functional.index.map(water_quality_mapping)
plt.figure(figsize=(10, 6))
sns.barplot(x=water_quality_functional.index, y=water_quality_functional.values, palette="coolwarm")
plt.title("Water Quality Distribution for Pumps That Are Functional")
plt.xlabel("Water Quality")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/2128161353.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=water_quality_functional.index, y=water_quality_functional.values, palette="coolwarm")
In [ ]:
# Extract feature importance from the trained Random Forest model
importances = rf_model.feature_importances_
feature_names = X_train.columns
# Convert to DataFrame
feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)
In [ ]:
X_train_reduced = X_train.drop(columns=['water_quality'])
X_test_reduced = X_test.drop(columns=['water_quality'])
rf_model_reduced = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
rf_model_reduced.fit(X_train_reduced, y_train)
y_pred_reduced = rf_model_reduced.predict(X_test_reduced)
print(f"Random Forest Accuracy Without Water Quality: {accuracy_score(y_test, y_pred_reduced):.4f}")
print(classification_report(y_test, y_pred_reduced))
Random Forest Accuracy Without Water Quality: 0.8048
precision recall f1-score support
0 0.80 0.86 0.83 6457
2 0.82 0.74 0.78 5423
accuracy 0.80 11880
macro avg 0.81 0.80 0.80 11880
weighted avg 0.81 0.80 0.80 11880
In [ ]:
X_train = X_train.drop(columns=['water_quality'])
X_test = X_test.drop(columns=['water_quality'])
rf_model_final = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
rf_model_final.fit(X_train, y_train)
y_pred_final = rf_model_final.predict(X_test)
print(f"Final Random Forest Accuracy (Without Water Quality): {accuracy_score(y_test, y_pred_final):.4f}")
print(classification_report(y_test, y_pred_final))
Final Random Forest Accuracy (Without Water Quality): 0.8048
precision recall f1-score support
0 0.80 0.86 0.83 6457
2 0.82 0.74 0.78 5423
accuracy 0.80 11880
macro avg 0.81 0.80 0.80 11880
weighted avg 0.81 0.80 0.80 11880
In [ ]:
importances = rf_model_final.feature_importances_
feature_names = X_train.columns
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(10, 6))
plt.title("Feature Importance - Final Random Forest")
plt.bar(range(len(importances)), importances[indices], align="center")
plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
plt.xlabel("Feature Name")
plt.ylabel("Importance Score")
plt.show()
In [ ]:
print(X_train['extraction_modernity'].value_counts())
extraction_modernity 0 47520 Name: count, dtype: int64
In [ ]:
# Drop extraction_modernity and retrain
X_train_reduced = X_train.drop(columns=['extraction_modernity'])
X_test_reduced = X_test.drop(columns=['extraction_modernity'])
rf_model_reduced = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
rf_model_reduced.fit(X_train_reduced, y_train)
y_pred_reduced = rf_model_reduced.predict(X_test_reduced)
# Evaluate
print(f"Random Forest Accuracy Without Extraction Modernity: {accuracy_score(y_test, y_pred_reduced):.4f}")
print(classification_report(y_test, y_pred_reduced))
Random Forest Accuracy Without Extraction Modernity: 0.8056
precision recall f1-score support
0 0.80 0.86 0.83 6457
2 0.82 0.74 0.78 5423
accuracy 0.81 11880
macro avg 0.81 0.80 0.80 11880
weighted avg 0.81 0.81 0.80 11880
In [ ]:
X_train = X_train.drop(columns=['extraction_modernity'])
X_test = X_test.drop(columns=['extraction_modernity'])
In [ ]:
rf_model_final = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
rf_model_final.fit(X_train, y_train)
y_pred_final = rf_model_final.predict(X_test)
print(f"Final Random Forest Accuracy (Without Water Quality): {accuracy_score(y_test, y_pred_final):.4f}")
print(classification_report(y_test, y_pred_final))
Final Random Forest Accuracy (Without Water Quality): 0.8056
precision recall f1-score support
0 0.80 0.86 0.83 6457
2 0.82 0.74 0.78 5423
accuracy 0.81 11880
macro avg 0.81 0.80 0.80 11880
weighted avg 0.81 0.81 0.80 11880
In [ ]:
y_test = y_test.reset_index(drop=True)
y_pred_rf = pd.Series(y_pred_rf, index=y_test.index)
false_positives = X_test[(y_test == 0) & (y_pred_rf == 2)]
true_functional = X_test[y_test == 0]
In [ ]:
plt.figure(figsize=(8, 5))
sns.histplot(false_positives['height_per_year'], color='red', label='False Positives', kde=True)
sns.histplot(true_functional['height_per_year'], color='blue', label='True Functional', kde=True)
plt.legend()
plt.title("Distribution of Height Per Year (False Positives vs. True Functional)")
plt.show()
In [ ]:
print(false_positives['quantity_bin'].value_counts(normalize=True))
print(true_functional['quantity_bin'].value_counts(normalize=True))
quantity_bin 1 0.972382 0 0.027618 Name: proportion, dtype: float64 quantity_bin 1 0.995664 0 0.004336 Name: proportion, dtype: float64
In [ ]:
print(false_positives['funder'].value_counts().head(10))
funder 29 191 1 158 82 70 42 52 32 39 13 38 89 19 20 16 70 15 17 14 Name: count, dtype: int64
In [ ]:
print(false_positives['waterpoint_type'].value_counts())
waterpoint_type 1 400 4 305 2 86 6 71 5 4 0 2 3 1 Name: count, dtype: int64
In [ ]:
# Define bins and labels for height_per_year. Hoping by binning into broader categories, the model won't over-weight lower height values🤞🏽
bins = [-np.inf, 0, 50, 250, 600, np.inf]
labels = ['low', 'very_low', 'moderate', 'high', 'very_high']
X_train['height_per_year_bin'] = pd.cut(X_train['height_per_year'], bins=bins, labels=labels)
X_test['height_per_year_bin'] = pd.cut(X_test['height_per_year'], bins=bins, labels=labels)
X_train = X_train.drop(columns=['height_per_year'])
X_test = X_test.drop(columns=['height_per_year'])
In [ ]:
label_encoder = LabelEncoder()
X_train['height_per_year_bin'] = label_encoder.fit_transform(X_train['height_per_year_bin'])
X_test['height_per_year_bin'] = label_encoder.transform(X_test['height_per_year_bin'])
In [ ]:
# Run this Random Forest model back!
rf_model_updated = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
rf_model_updated.fit(X_train, y_train)
y_pred_updated = rf_model_updated.predict(X_test)
print(f"Random Forest Accuracy After Binning: {accuracy_score(y_test, y_pred_updated):.4f}")
print(classification_report(y_test, y_pred_updated))
Random Forest Accuracy After Binning: 0.8000
precision recall f1-score support
0 0.79 0.86 0.82 6457
2 0.81 0.73 0.77 5423
accuracy 0.80 11880
macro avg 0.80 0.79 0.80 11880
weighted avg 0.80 0.80 0.80 11880
In [ ]:
importances = rf_model_updated.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance_df["Importance"], y=feature_importance_df["Feature"], palette="Blues_r")
plt.xlabel("Importance Score")
plt.ylabel("Feature Name")
plt.title("Feature Importance - Random Forest (After Binning)")
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/3595410040.py:8: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=feature_importance_df["Importance"], y=feature_importance_df["Feature"], palette="Blues_r")
In [ ]:
plt.figure(figsize=(8, 5))
sns.histplot(false_positives['quantity_bin'], color='red', label='False Positives', kde=True)
sns.histplot(true_functional['quantity_bin'], color='blue', label='True Functional', kde=True)
plt.legend()
plt.title("Distribution of Quantity (False Positives vs. True Functional)")
plt.show()
In [ ]:
plt.figure(figsize=(8, 5))
sns.histplot(false_positives['funder'], color='red', label='False Positives', kde=True)
sns.histplot(true_functional['funder'], color='blue', label='True Functional', kde=True)
plt.legend()
plt.title("Distribution of Funder (False Positives vs. True Functional)")
plt.show()
In [ ]:
false_positive_funders = false_positives['funder'].value_counts().head(10)
false_positive_funders
Out[ ]:
funder 29 191 1 158 82 70 42 52 32 39 13 38 89 19 20 16 70 15 17 14 Name: count, dtype: int64
In [ ]:
true_functional_funders = true_functional['funder'].value_counts().head(10)
true_functional_funders
Out[ ]:
funder 1 1458 29 744 82 425 13 339 32 164 70 162 90 146 62 130 42 130 81 129 Name: count, dtype: int64
In [ ]:
df_false_funders = pd.DataFrame({'Funder': false_positive_funders.index, 'False Positive Count': false_positive_funders.values})
df_true_funders = pd.DataFrame({'Funder': true_functional_funders.index, 'True Functional Count': true_functional_funders.values})
plt.figure(figsize=(10, 5))
sns.barplot(y=df_false_funders["Funder"], x=df_false_funders["False Positive Count"], palette="Reds_r")
plt.xlabel("False Positive Count")
plt.ylabel("Funder")
plt.title("Top Funders in False Positives")
plt.show()
plt.figure(figsize=(10, 5))
sns.barplot(y=df_true_funders["Funder"], x=df_true_funders["True Functional Count"], palette="Blues_r")
plt.xlabel("True Functional Count")
plt.ylabel("Funder")
plt.title("Top Funders in True Functional Pumps")
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/741957867.py:5: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(y=df_false_funders["Funder"], x=df_false_funders["False Positive Count"], palette="Reds_r")
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/741957867.py:12: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(y=df_true_funders["Funder"], x=df_true_funders["True Functional Count"], palette="Blues_r")
In [ ]:
# Step 1: Identify the top 5 most common funders
top_5_funders = X_train['funder'].value_counts().index[:5]
In [ ]:
# Step 2: Group funders (keep top 5, set others as "Other")
X_train['funder_grouped'] = X_train['funder'].apply(lambda x: x if x in top_5_funders else 'Other')
X_test['funder_grouped'] = X_test['funder'].apply(lambda x: x if x in top_5_funders else 'Other')
In [ ]:
# Ensure all funders are strings before encoding
X_train['funder_grouped'] = X_train['funder_grouped'].astype(str)
X_test['funder_grouped'] = X_test['funder_grouped'].astype(str)
# Now encode the grouped funder column
funder_encoder = LabelEncoder()
X_train['funder_grouped'] = funder_encoder.fit_transform(X_train['funder_grouped'])
X_test['funder_grouped'] = funder_encoder.transform(X_test['funder_grouped']) # Apply the same mapping to test set
In [ ]:
# Step 4: Train the Random Forest model with reduced funder categories
rf_model_reduced_funder = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
rf_model_reduced_funder.fit(X_train, y_train)
y_pred_reduced_funder = rf_model_reduced_funder.predict(X_test)
accuracy_reduced_funder = accuracy_score(y_test, y_pred_reduced_funder)
classification_report_reduced_funder = classification_report(y_test, y_pred_reduced_funder)
print(f"Random Forest Accuracy After Grouping Funders: {accuracy_reduced_funder:.4f}")
print(classification_report_reduced_funder)
Random Forest Accuracy After Grouping Funders: 0.8004
precision recall f1-score support
0 0.79 0.86 0.82 6457
2 0.81 0.73 0.77 5423
accuracy 0.80 11880
macro avg 0.80 0.79 0.80 11880
weighted avg 0.80 0.80 0.80 11880
In [ ]:
top_10_funders = X_train['funder'].value_counts().index[:10]
In [ ]:
X_train['funder_grouped'] = X_train['funder'].apply(lambda x: x if x in top_10_funders else 'Other')
X_test['funder_grouped'] = X_test['funder'].apply(lambda x: x if x in top_10_funders else 'Other')
In [ ]:
X_train['funder_grouped'] = X_train['funder_grouped'].astype(str)
X_test['funder_grouped'] = X_test['funder_grouped'].astype(str)
funder_encoder = LabelEncoder()
X_train['funder_grouped'] = funder_encoder.fit_transform(X_train['funder_grouped'])
X_test['funder_grouped'] = funder_encoder.transform(X_test['funder_grouped'])
In [ ]:
# Step 4: Train the Random Forest model with reduced funder categories
rf_model_reduced_funder_10 = RandomForestClassifier(**best_params_rf, random_state=42, n_jobs=-1)
rf_model_reduced_funder_10.fit(X_train, y_train)
y_pred_reduced_funder_10 = rf_model_reduced_funder_10.predict(X_test)
accuracy_reduced_funder_10 = accuracy_score(y_test, y_pred_reduced_funder_10)
classification_report_reduced_funder_10 = classification_report(y_test, y_pred_reduced_funder_10)
print(f"Random Forest Accuracy After Grouping Funders: {accuracy_reduced_funder_10:.4f}")
print(classification_report_reduced_funder_10)
Random Forest Accuracy After Grouping Funders: 0.7994
precision recall f1-score support
0 0.79 0.86 0.82 6457
2 0.81 0.73 0.77 5423
accuracy 0.80 11880
macro avg 0.80 0.79 0.80 11880
weighted avg 0.80 0.80 0.80 11880
In [ ]:
plt.figure(figsize=(8, 5))
sns.histplot(false_positives['waterpoint_type'], color='red', label='False Positives', kde=True)
sns.histplot(true_functional['waterpoint_type'], color='blue', label='True Functional', kde=True)
plt.legend()
plt.title("Distribution of Waterpoint Type (False Positives vs. True Functional)")
plt.show()
In [ ]:
waterpoint_counts = X_train['waterpoint_type'].value_counts()
waterpoint_counts
Out[ ]:
waterpoint_type 1 22785 4 14073 6 5098 2 4830 5 639 0 91 3 4 Name: count, dtype: int64
In [ ]:
df_waterpoint_counts = pd.DataFrame({'Waterpoint Type': waterpoint_counts.index, 'Count': waterpoint_counts.values})
df_waterpoint_counts
Out[ ]:
| Waterpoint Type | Count | |
|---|---|---|
| 0 | 1 | 22785 |
| 1 | 4 | 14073 |
| 2 | 6 | 5098 |
| 3 | 2 | 4830 |
| 4 | 5 | 639 |
| 5 | 0 | 91 |
| 6 | 3 | 4 |
In [ ]:
# Plot distribution of waterpoint types
plt.figure(figsize=(12, 6))
sns.barplot(x=df_waterpoint_counts["Waterpoint Type"], y=df_waterpoint_counts["Count"], palette="Blues_r")
plt.xlabel("Waterpoint Type")
plt.ylabel("Count")
plt.title("Distribution of Waterpoint Types in Training Data")
plt.xticks(rotation=45)
plt.show()
/var/folders/m7/tzk7gpbd05j79z_hsb25z6jm0000gn/T/ipykernel_1131/106240160.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=df_waterpoint_counts["Waterpoint Type"], y=df_waterpoint_counts["Count"], palette="Blues_r")
In [ ]:
original_waterpoint_labels = X_train['waterpoint_type'].value_counts()
original_waterpoint_labels
Out[ ]:
waterpoint_type 1 22785 4 14073 6 5098 2 4830 5 639 0 91 3 4 Name: count, dtype: int64
In [ ]:
waterpoint_mapping = {
0: "Shallow Well",
1: "Borehole",
2: "Protected Spring",
3: "Rainwater Harvesting",
4: "Standpipe",
5: "Unprotected Spring",
6: "Other"
}
In [ ]:
X_train['waterpoint_type_label'] = X_train['waterpoint_type'].map(waterpoint_mapping)
X_test['waterpoint_type_label'] = X_test['waterpoint_type'].map(waterpoint_mapping)
X_train['waterpoint_type_label'].value_counts()
Out[ ]:
waterpoint_type_label Borehole 22785 Standpipe 14073 Other 5098 Protected Spring 4830 Unprotected Spring 639 Shallow Well 91 Rainwater Harvesting 4 Name: count, dtype: int64
In [ ]: